library(tidyverse)
## devtools::install_github('jburos/biostan', build_vignettes = TRUE, dependencies = TRUE)
library(rstan)
## To avoid recompilation of unchanged Stan programs, we recommend calling
rstan_options(auto_write = TRUE)
##
library(bayesplot)
##
library(loo)
## Seed from random.org
set.seed(673788956)
This dataset contains information on the number of days absent from school for each student along with the student’s math test score and the type of program that the student belonged to.
data1 <- haven::read_dta("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta")
data1 <- data1 %>%
mutate(prog = factor(prog, levels = 1:3, labels = c("General", "Academic", "Vocational")),
id = factor(id))
data1
## # A tibble: 314 x 5
## id gender math daysabs prog
## <fct> <dbl+lbl> <dbl> <dbl> <fct>
## 1 1001 2 63 4 Academic
## 2 1002 2 27 4 Academic
## 3 1003 1 20 2 Academic
## 4 1004 1 16 3 Academic
## 5 1005 1 2 3 Academic
## 6 1006 1 71 13 Academic
## 7 1007 1 63 11 Academic
## 8 1008 2 3 7 Academic
## 9 1009 2 51 10 Academic
## 10 1010 2 49 9 Vocational
## # ... with 304 more rows
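Before modeling, it is worth noting the share of exact zeros in the outcome, since that motivates the zero-inflated model considered later. A minimal sketch using the already loaded data1:
## Proportion of students with zero days absent (57 of 314 according to the count table below).
mean(data1$daysabs == 0)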
## For counting
data1_count <- data1 %>%
rename(y = daysabs) %>%
count(y)
data1_count
## # A tibble: 30 x 2
## y n
## <dbl> <int>
## 1 0 57
## 2 1 41
## 3 2 28
## 4 3 27
## 5 4 25
## 6 5 20
## 7 6 16
## 8 7 14
## 9 8 10
## 10 9 13
## # ... with 20 more rows
The distribution of days absent is widely spread and right-skewed, with a mode at zero and a long right tail.
data1 %>%
ggplot(mapping = aes(x = daysabs, fill = prog)) +
geom_bar(stat = "count") +
facet_grid(prog ~ ., margin = TRUE, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
A rough within-program comparison of means and variances indicates overdispersion, although this check does not account for the continuous math test score.
data1 %>%
group_by(prog) %>%
summarize(mean = mean(daysabs),
var = var(daysabs))
## # A tibble: 3 x 3
## prog mean var
## <fct> <dbl> <dbl>
## 1 General 10.6 67.3
## 2 Academic 6.93 55.4
## 3 Vocational 2.67 13.9
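A covariate-adjusted alternative is the quasi-Poisson dispersion estimate, which adjusts for both math and prog; a value well above 1 again points to overdispersion. This is a quick sketch, not part of the original analysis, and fit_qp is a hypothetical name introduced only here.
## Quasi-Poisson fit; summary()$dispersion is the estimated dispersion parameter.
fit_qp <- glm(daysabs ~ math + prog, data = data1, family = quasipoisson(link = "log"))
summary(fit_qp)$dispersion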
We will assume the design matrix from the UCLA IDRE (DAE) example is sufficient for modeling the covariates. rstan requires a numeric design matrix, which we construct with model.matrix().
X <- model.matrix(object = daysabs ~ math + prog, data = data1)
head(X, n = 10)
## (Intercept) math progAcademic progVocational
## 1 1 63 1 0
## 2 1 27 1 0
## 3 1 20 1 0
## 4 1 16 1 0
## 5 1 2 1 0
## 6 1 71 1 0
## 7 1 63 1 0
## 8 1 3 1 0
## 9 1 51 1 0
## 10 1 49 0 1
y <- data1$daysabs
We will use the following helper functions to extract and visualize posterior predictive distributions.
##
extract_post_pred <- function(stan_fit) {
tidybayes::tidy_draws(stan_fit) %>%
select(.chain, .iteration, .draw, starts_with("y_new")) %>%
gather(key = key, value = value, starts_with("y_new")) %>%
mutate(key = gsub("y_new|\\[|\\]", "", key) %>% as.integer())
}
##
plot_draws <- function(stan_fit, n_sample, data_count = data1_count) {
draw_data <- extract_post_pred(stan_fit)
sample_indices <- sample(seq_len(max(draw_data$.iteration)), size = n_sample)
draw_data %>%
group_by(.chain, .iteration, .draw) %>%
count(value) %>%
filter(.iteration %in% sample_indices) %>%
ggplot(mapping = aes(x = value, y = n, group = interaction(.chain, .iteration, .draw))) +
## Plot random draws from posterior
geom_line(alpha = 0.5, size = 0.1) +
## Include actual data distribution
geom_line(data = data_count, color = "gray", alpha = 0.7, size = 2,
mapping = aes(x = y, group = NULL)) +
facet_wrap( ~ .chain) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
}
This is a Poisson model without any covariates. The MCMC diagnostics are good, but the model is a poor fit, as seen in the discrepancy between the posterior predictive distributions and the data distribution. Here \(a\) and \(b\) are the hyperparameters supplied in the data block. \[\begin{align*} &\text{Prior}\\ \lambda | a, b &\sim Gamma(a, b)\\ \\ &\text{Likelihood}\\ y_{i} | \lambda &\overset{\text{iid}}{\sim} Poisson(\lambda)\\ \end{align*} \]
stan_code_poisson <- biostan::read_stan_file("./bmspe_poisson.stan")
## Warning: replacing previous import 'rstan::loo' by 'rstanarm::loo' when loading 'biostan'
biostan::print_stan_code(stan_code_poisson, section = NULL)
## data {
## /* Hyperparameters*/
## real<lower=0> a;
## real<lower=0> b;
##
## /* Dimensions */
## int<lower=0> N;
## int<lower=0> M;
## /* Design Matrix */
## matrix[N,M] X;
## /* Outcome (a real vector of length n) */
## int<lower=0> y[N];
## }
##
## parameters {
## real<lower=0> lambda;
## }
##
## model {
## /* Prior */
## /* lambda ~ gamma(a, b); */
## /* Explicit contribution to target */
## target += gamma_lpdf(lambda | a, b);
##
## /* Likelihood */
## /* y ~ poisson(lambda); */
## /* Explicit contribution to target */
## for (i in 1:N) {
## target += poisson_lpmf(y[i] | lambda);
## }
## }
##
## generated quantities {
## int<lower=0> y_new[N];
## vector[N] log_lik;
##
## for (i in 1:N) {
## y_new[i] = poisson_rng(lambda);
## log_lik[i] = poisson_lpmf(y[i] | lambda);
## }
## }
##
stan_model_poisson <- stan(model_code = stan_code_poisson,
data = list(a = 10^(-3), b = 10^(-3),
N = nrow(X), M = ncol(X),
X = X,
y = y),
chains = n_cores)
check_hmc_diagnostics(stan_model_poisson)
##
## Divergences:
##
## Tree depth:
##
## Energy:
pars <- c("lambda","lp__")
print(stan_model_poisson, pars = pars)
## Inference for Stan model: 843946174fd41dafb52150aa37e93e57.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## lambda 5.95 0.00 0.14 5.69 5.86 5.95 6.04 6.23 4430 1
## lp__ -1557.93 0.01 0.70 -1559.96 -1558.09 -1557.65 -1557.48 -1557.43 5340 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:20:12 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pairs(stan_model_poisson, pars = pars)
traceplot(stan_model_poisson, inc_warmup = TRUE, pars = pars)
plot_draws(stan_model_poisson, n_sample = 20)
loo(stan_model_poisson)
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1555.2 92.9
## p_loo 8.3 1.1
## looic 3110.3 185.8
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
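As a sanity check on the sampler (a sketch, not in the original analysis): the Gamma prior is conjugate to the Poisson likelihood, so the posterior of \(\lambda\) is available in closed form, and its quantiles should closely match the MCMC summary of lambda above. Here y is the outcome vector defined earlier and a0, b0 are the same hyperparameters passed to Stan.
## Closed-form posterior: lambda | y ~ Gamma(a + sum(y), b + N).
a0 <- 10^(-3); b0 <- 10^(-3)
qgamma(c(0.025, 0.5, 0.975), shape = a0 + sum(y), rate = b0 + length(y))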
Including covariates improves the fit somewhat, and the posterior predictive distributions are closer to the data distribution. However, the excess of zeros is still not accounted for well. \[\begin{align*} &\text{Prior}\\ \boldsymbol{\beta} &\sim MVN(\boldsymbol{0}, s\mathbf{I})\\ \\ &\text{Likelihood}\\ \eta_{i} &= \mathbf{X}_{i}^{T}\boldsymbol{\beta}\\ \mu_{i} &= e^{\eta_{i}}\\ y_{i} | \mu_{i} &\overset{\text{ind}}{\sim} Poisson(\mu_{i})\\ \end{align*} \]
stan_code_poisson_covs <- biostan::read_stan_file("./bmspe_poisson_covs.stan")
biostan::print_stan_code(stan_code_poisson_covs, section = NULL)
## data {
## /* Hyperparameters*/
## real<lower=0> s;
##
## /* Dimensions */
## int<lower=0> N;
## int<lower=0> M;
## /* Design Matrix */
## matrix[N,M] X;
## /* Outcome */
## int<lower=0> y[N];
## }
##
## parameters {
## vector[M] beta;
## }
##
## transformed parameters {
## vector[N] eta;
##
## eta = X * beta;
## }
##
## model {
## /* Prior */
## for (j in 1:M) {
## target += normal_lpdf(beta[j] | 0, s);
## }
##
## /* Likelihood */
## /* y_i ~ poisson(exp(eta_i)); */
## for (i in 1:N) {
## target += poisson_log_lpmf(y[i] | eta[i]);
## }
## }
##
## generated quantities {
## int<lower=0> y_new[N];
## vector[N] log_lik;
##
## for (i in 1:N) {
## if (eta[i] > 20) {
##       /* To avoid errors like the one below during warmup. */
## /* [2] " Exception: poisson_log_rng: Log rate parameter is 69.8999, but must be less than 20.7944 (in 'modelb25632c451a8_0ba3e86968d73bc8913ee68b8ce5542b' at line 40)" */
## /* Check posterior predictive. */
## y_new[i] = poisson_log_rng(20);
## } else {
## y_new[i] = poisson_log_rng(eta[i]);
## }
##
## log_lik[i] = poisson_log_lpmf(y[i] | eta[i]);
## }
## }
##
stan_model_poisson_covs <- stan(model_code = stan_code_poisson_covs,
data = list(s = 10,
N = nrow(X), M = ncol(X),
X = X,
y = y),
chains = n_cores)
check_hmc_diagnostics(stan_model_poisson_covs)
##
## Divergences:
##
## Tree depth:
##
## Energy:
pars <- c("beta","lp__")
print(stan_model_poisson_covs, pars = pars)
## Inference for Stan model: 1bf021e5d6a2cfcbeea4a9c4dd4e7110.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.65 0.00 0.06 2.53 2.61 2.65 2.69 2.77 3957 1
## beta[2] -0.01 0.00 0.00 -0.01 -0.01 -0.01 -0.01 -0.01 8260 1
## beta[3] -0.44 0.00 0.06 -0.55 -0.48 -0.44 -0.40 -0.33 3929 1
## beta[4] -1.28 0.00 0.08 -1.44 -1.34 -1.28 -1.23 -1.13 4375 1
## lp__ -1343.58 0.02 1.41 -1347.15 -1344.27 -1343.26 -1342.54 -1341.81 4648 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:21:15 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pairs(stan_model_poisson_covs, pars = pars)
traceplot(stan_model_poisson_covs, inc_warmup = TRUE, pars = pars)
plot_draws(stan_model_poisson_covs, n_sample = 20)
loo(stan_model_poisson_covs)
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1343.5 74.5
## p_loo 25.1 3.4
## looic 2686.9 149.1
## ------
## Monte Carlo SE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
We are using vague priors on the regression coefficients, so the results are similar to the corresponding Frequentist fit.
## Frequentist Poisson fit for comparison
summary(glm(formula = daysabs ~ math + prog, data = data1, family = poisson(link = "log")))
##
## Call:
## glm(formula = daysabs ~ math + prog, family = poisson(link = "log"),
## data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2597 -2.2038 -0.9193 0.6511 7.4233
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.651974 0.060736 43.664 < 2e-16 ***
## math -0.006808 0.000931 -7.313 2.62e-13 ***
## progAcademic -0.439897 0.056672 -7.762 8.35e-15 ***
## progVocational -1.281364 0.077886 -16.452 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2217.7 on 313 degrees of freedom
## Residual deviance: 1774.0 on 310 degrees of freedom
## AIC: 2665.3
##
## Number of Fisher Scoring iterations: 5
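To make the comparison above concrete, here is a small sketch placing the posterior means next to the maximum likelihood estimates (fit_glm_pois is a hypothetical name introduced only for this check).
## Side-by-side comparison of Bayesian posterior means and frequentist MLEs.
fit_glm_pois <- glm(daysabs ~ math + prog, data = data1, family = poisson(link = "log"))
cbind(bayes_mean = summary(stan_model_poisson_covs, pars = "beta")$summary[, "mean"],
      freq_mle   = coef(fit_glm_pois))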
The zero-inflated Poisson (ZIP) model is a two-part model that accommodates two different types of zeros: structural zeros, which come from a subpopulation that can only have zeros, and Poisson zeros, which come from a subpopulation that can have non-zero values but happened to have zeros. In this example of days absent from school, a structural-zero subpopulation may not be very plausible. \[\begin{align*} &\text{Prior}\\ \boldsymbol{\beta} &\sim MVN(\boldsymbol{0}, \text{diag}(s))\\ \boldsymbol{\beta}_{\theta} &\sim MVN(\boldsymbol{0}, \text{diag}(s_{\theta}))\\ \\ &\text{Likelihood}\\ \eta_{\theta,i} &= \mathbf{X}_{i}^{T}\boldsymbol{\beta}_{\theta}\\ \mu_{\theta,i} &= \text{expit}(\eta_{\theta,i})\\ z_{i} | \mu_{\theta,i} &\overset{\text{ind}}{\sim} Bernoulli(\mu_{\theta,i})\\ \\ \eta_{i} &= \mathbf{X}_{i}^{T}\boldsymbol{\beta}\\ \mu_{i} &= e^{\eta_{i}}\\ y_{i} | \mu_{i}, z_{i} &\overset{\text{ind}}{\sim} \begin{cases} 0 & \text{if } z_{i} = 1\\ Poisson(\mu_{i}) & \text{if } z_{i} = 0 \end{cases}\\ \end{align*} \]
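For intuition, here is a minimal R sketch of the per-observation ZIP log-likelihood after marginalizing the latent indicator \(z_{i}\); it mirrors the log_sum_exp construction used in the Stan model below (dzip_log is a hypothetical helper introduced only for illustration).
## theta: structural-zero probability; mu: Poisson mean.
dzip_log <- function(y, theta, mu) {
  ifelse(y == 0,
         ## Zero case: either a structural zero or a Poisson zero.
         log(theta + (1 - theta) * dpois(0, mu)),
         ## Non-zero case: must not be a structural zero.
         log(1 - theta) + dpois(y, mu, log = TRUE))
}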
stan_code_poisson_covs_zip <- biostan::read_stan_file("./bmspe_poisson_covs_zip.stan")
biostan::print_stan_code(stan_code_poisson_covs_zip, section = NULL)
## /* https://discourse.mc-stan.org/t/learning-how-to-write-a-zero-inflated-poisson-model-with-stan/6726 */
## data {
## /* Dimensions */
## int<lower=0> N;
## int<lower=0> M;
## /* Design Matrix */
## matrix[N,M] X;
## /* Outcome */
## int<lower=0> y[N];
## /* Hyperparameters*/
## real<lower=0> s[M];
## real<lower=0> s_theta[M];
## }
##
## parameters {
## vector[M] beta;
## vector[M] beta_theta;
## }
##
## transformed parameters {
## vector[N] eta;
## vector[N] eta_theta;
##
## eta = X * beta;
## eta_theta = X * beta_theta;
## }
##
## model {
## /* Prior */
## for (j in 1:M) {
## target += normal_lpdf(beta[j] | 0, s[j]);
## target += normal_lpdf(beta_theta[j] | 0, s_theta[j]);
## }
##
## /* Likelihood */
## /* https://github.com/paul-buerkner/brms/blob/master/inst/chunks/fun_zero_inflated_poisson.stan */
## for (i in 1:N) {
## if (y[i] == 0) {
## /* Zero case */
## target += log_sum_exp(/* Structural zero */
## bernoulli_logit_lpmf(1 | eta_theta[i]),
## /* Poisson zero */
## bernoulli_logit_lpmf(0 | eta_theta[i]) +
## poisson_log_lpmf(0 | eta[i]));
## } else {
## /* Non-zero case */
## /* First term means not structural zero. */
## target += bernoulli_logit_lpmf(0 | eta_theta[i]) +
## /* y[i] is relevant only here. */
## poisson_log_lpmf(y[i] | eta[i]);
## }
## }
## }
##
## generated quantities {
## int y_new[N];
## vector[N] log_lik;
##
## for (i in 1:N) {
##
## if (bernoulli_logit_rng(eta_theta[i]) == 1) {
## /* Structural zero */
## y_new[i] = 0;
## } else {
## /* Not structural zero */
## if (eta[i] > 20) {
##       /* To avoid errors like the one below during warmup. */
## /* Check posterior predictive. */
## y_new[i] = poisson_log_rng(20);
## } else {
## y_new[i] = poisson_log_rng(eta[i]);
## }
## }
##
##
## if (y[i] == 0) {
## /* Zero case */
## log_lik[i] = log_sum_exp(/* Structural zero */
## bernoulli_logit_lpmf(1 | eta_theta[i]),
## /* Poisson zero */
## bernoulli_logit_lpmf(0 | eta_theta[i]) +
## poisson_log_lpmf(0 | eta[i]));
## } else {
## /* Non-zero case */
## /* First term means not structural zero. */
## log_lik[i] = bernoulli_logit_lpmf(0 | eta_theta[i]) +
## /* y[i] is relevant only here. */
## poisson_log_lpmf(y[i] | eta[i]);
## }
##
## }
## }
##
stan_model_poisson_covs_zip <- stan(model_code = stan_code_poisson_covs_zip,
## Regularize binomial part to avoid divergent iterations.
data = list(s = rep(10,ncol(X)), s_theta = rep(10,ncol(X)),
N = nrow(X), M = ncol(X),
X = X,
y = y),
chains = n_cores)
## Warning: There were 601 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: There were 1747 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
check_hmc_diagnostics(stan_model_poisson_covs_zip)
##
## Divergences:
##
## Tree depth:
##
## Energy:
pars <- c("beta","beta_theta","lp__")
print(stan_model_poisson_covs_zip, pars = pars)
## Inference for Stan model: d0f831067dd936a77b47661b4f6b0c98.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.61 0.01 0.07 2.48 2.56 2.61 2.65 2.75 30 1.18
## beta[2] -0.01 0.00 0.00 -0.01 -0.01 -0.01 0.00 0.00 13 1.37
## beta[3] -0.34 0.02 0.08 -0.50 -0.39 -0.33 -0.28 -0.20 13 1.45
## beta[4] -1.03 0.06 0.16 -1.38 -1.14 -0.98 -0.91 -0.80 8 2.13
## beta_theta[1] -6.16 1.34 5.39 -14.95 -9.28 -6.81 -4.62 8.00 16 1.30
## beta_theta[2] -2.04 1.43 4.35 -14.99 -0.10 0.01 0.02 0.03 9 1.77
## beta_theta[3] 6.15 0.24 4.00 0.83 3.41 5.38 8.14 15.77 275 1.04
## beta_theta[4] 4.81 1.17 6.32 -12.97 3.39 5.48 8.05 14.86 29 1.15
## lp__ -1254.89 22.69 55.63 -1354.10 -1262.35 -1223.35 -1221.72 -1220.01 6 27.49
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:24:16 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pairs(stan_model_poisson_covs_zip, pars = pars)
traceplot(stan_model_poisson_covs_zip, inc_warmup = TRUE, pars = pars)
plot_draws(stan_model_poisson_covs_zip, n_sample = 20)
loo(stan_model_poisson_covs_zip)
## Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1320.7 69.0
## p_loo 147.7 15.1
## looic 2641.4 138.0
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 312 99.4% 2
## (0.5, 0.7] (ok) 0 0.0% <NA>
## (0.7, 1] (bad) 2 0.6% 0
## (1, Inf) (very bad) 0 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
The model did not converge with vague priors on both sets of coefficients. The pairs plot shows straight-line patterns among the Bernoulli (zero-inflation) model coefficients. As seen in the barplots of the data, the General program does not have any individuals with zero days absent from school, which may be causing a complete-separation issue. Treating this as a data sparsity problem, we will examine a model that regularizes the non-intercept coefficients of the Bernoulli model more heavily.
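Before refitting, we can verify the claimed sparsity directly with a quick sketch (General is expected to have no zeros).
## Number of zero-absence students by program.
data1 %>%
    group_by(prog) %>%
    summarize(n = n(), n_zero = sum(daysabs == 0))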
stan_model_poisson_covs_zip2 <- stan(model_code = stan_code_poisson_covs_zip,
## Regularize binomial part to avoid divergent iterations.
data = list(s = rep(10,ncol(X)), s_theta = c(10,rep(2, ncol(X)-1)),
N = nrow(X), M = ncol(X),
X = X,
y = y),
chains = n_cores)
check_hmc_diagnostics(stan_model_poisson_covs_zip2)
##
## Divergences:
##
## Tree depth:
##
## Energy:
pars <- c("beta","beta_theta","lp__")
print(stan_model_poisson_covs_zip2, pars = pars)
## Inference for Stan model: d0f831067dd936a77b47661b4f6b0c98.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.59 0.00 0.06 2.47 2.55 2.59 2.63 2.70 6940 1
## beta[2] -0.01 0.00 0.00 -0.01 -0.01 -0.01 0.00 0.00 12619 1
## beta[3] -0.30 0.00 0.06 -0.42 -0.34 -0.30 -0.26 -0.19 6791 1
## beta[4] -0.94 0.00 0.08 -1.10 -0.99 -0.94 -0.89 -0.78 7814 1
## beta_theta[1] -4.40 0.01 0.91 -6.36 -4.98 -4.34 -3.75 -2.77 5036 1
## beta_theta[2] 0.01 0.00 0.01 0.00 0.01 0.01 0.02 0.03 10619 1
## beta_theta[3] 1.87 0.01 0.85 0.38 1.27 1.81 2.40 3.73 4965 1
## beta_theta[4] 2.79 0.01 0.85 1.33 2.19 2.74 3.32 4.66 4917 1
## lp__ -1219.84 0.03 2.01 -1224.74 -1220.93 -1219.51 -1218.37 -1216.97 4796 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:26:15 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pairs(stan_model_poisson_covs_zip2, pars = pars)
traceplot(stan_model_poisson_covs_zip2, inc_warmup = TRUE, pars = pars)
plot_draws(stan_model_poisson_covs_zip2, n_sample = 20)
loo(stan_model_poisson_covs_zip2)
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1210.2 64.2
## p_loo 25.4 3.2
## looic 2420.5 128.5
## ------
## Monte Carlo SE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
The regularized model converged. The narrow linear patterns among the Bernoulli model coefficients are probably due to strong posterior correlation. Being in either the Academic program (coefficient 3) or the Vocational program (coefficient 4) is a strong predictor of having zero absent days, so when one coefficient is large the other tends to be large as well, and the intercept (coefficient 1) must then be very negative to compensate.
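We can quantify the linear patterns seen in the pairs plot by examining the posterior correlations of the Bernoulli-part coefficients; a minimal sketch:
## Posterior correlation matrix of the zero-inflation coefficients.
beta_theta_draws <- as.matrix(stan_model_poisson_covs_zip2, pars = "beta_theta")
round(cor(beta_theta_draws), 2)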
## Frequentist ZIP fit for comparison
## https://stats.idre.ucla.edu/r/dae/zip/
summary(pscl::zeroinfl(formula = daysabs ~ math + prog | math + prog, data = data1))
##
## Call:
## pscl::zeroinfl(formula = daysabs ~ math + prog | math + prog, data = data1)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.3410 -1.1702 -0.6026 0.5864 7.3907
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.5879178 0.0614173 42.137 < 2e-16 ***
## math -0.0052060 0.0009291 -5.603 0.0000000210 ***
## progAcademic -0.3040946 0.0567438 -5.359 0.0000000836 ***
## progVocational -0.9397286 0.0794916 -11.822 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -19.203567 1705.841440 -0.011 0.991
## math 0.012639 0.006926 1.825 0.068 .
## progAcademic 16.737788 1705.841413 0.010 0.992
## progVocational 17.660593 1705.841412 0.010 0.992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 13
## Log-likelihood: -1192 on 8 Df
As expected, the non-regularized Frequentist fit breaks down, with very large standard errors for the Bernoulli (zero-inflation) model coefficients.
Next we consider a Poisson model with a multiplicative, individual-specific gamma random effect \(\gamma_{i}\), which accommodates overdispersion. \[\begin{align*} &\text{Prior}\\ \boldsymbol{\beta} &\sim MVN(\boldsymbol{0}, s\mathbf{I})\\ a_{\gamma} &\sim Gamma(a, b)\\ \\ &\text{Likelihood}\\ \eta_{i} &= \mathbf{X}_{i}^{T}\boldsymbol{\beta}\\ \mu_{i} &= e^{\eta_{i}}\\ \gamma_{i} | a_{\gamma} &\overset{\text{iid}}{\sim} Gamma(1/a_{\gamma},1/a_{\gamma}), \quad E[\gamma_{i}] = 1,\ Var(\gamma_{i}) = a_{\gamma}\\ y_{i} | \gamma_{i},\mu_{i} &\overset{\text{ind}}{\sim} Poisson(\gamma_{i} \mu_{i})\\ \end{align*} \]
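A quick simulation sketch (a_gamma_demo is a hypothetical value chosen only for illustration) confirms that a Gamma(1/a_gamma, 1/a_gamma) random effect has mean 1 and variance a_gamma, so it adds overdispersion without shifting the mean.
## Simulate the random effect distribution and check its first two moments.
a_gamma_demo <- 0.5
g <- rgamma(1e5, shape = 1 / a_gamma_demo, rate = 1 / a_gamma_demo)
c(mean = mean(g), var = var(g))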
stan_code_poisson_covs_gamma <- biostan::read_stan_file("./bmspe_poisson_covs_gamma.stan")
biostan::print_stan_code(stan_code_poisson_covs_gamma, section = NULL)
## data {
## /* Hyperparameters*/
## real<lower=0> a;
## real<lower=0> b;
## real<lower=0> s;
##
## /* Dimensions */
## int<lower=0> N;
## int<lower=0> M;
## /* Design Matrix */
## matrix[N,M] X;
## /* Outcome */
## int<lower=0> y[N];
## }
##
## parameters {
## vector[M] beta;
## vector<lower=0>[N] gamma;
## real<lower=0> a_gamma;
## }
##
## transformed parameters {
## vector[N] eta;
## vector<lower=0>[N] mu;
##
## eta = X * beta;
## mu = exp(eta);
## }
##
## model {
## /* Prior */
## for (j in 1:M) {
## /* beta_j ~ N(0, s) */
## target += normal_lpdf(beta[j] | 0, s);
## }
## /* a_gamma ~ Gamma(a, b) */
## target += gamma_lpdf(a_gamma | a, b);
##
## /* Likelihood */
## for (i in 1:N) {
##     /* gamma_i ~ Gamma(1/a_gamma, 1/a_gamma) */
## /* E[gamma_i] = 1 must be met for identifiability. */
## target += gamma_lpdf(gamma[i] | 1/a_gamma, 1/a_gamma);
## /* y_i ~ poisson(gamma_i * mu_i); */
## target += poisson_lpmf(y[i] | gamma[i] * mu[i]);
## }
## }
##
## generated quantities {
## int y_new[N];
## vector[N] log_lik;
##
## for (i in 1:N) {
## if (gamma[i] * mu[i] > 1e+09) {
##     /* To avoid errors like the one below during warmup. */
## /* Exception: poisson_rng: Rate parameter is 1.92222e+40, but must be less than 1.07374e+09 */
## /* Check posterior predictive. */
## y_new[i] = -1;
## } else {
## y_new[i] = poisson_rng(gamma[i] * mu[i]);
## }
##
## log_lik[i] = gamma_lpdf(gamma[i] | 1/a_gamma, 1/a_gamma) + poisson_lpmf(y[i] | gamma[i] * mu[i]);
## }
## }
##
stan_model_poisson_covs_gamma <- stan(model_code = stan_code_poisson_covs_gamma,
data = list(a = 10^(-3), b = 10^(-3),
s = 10,
N = nrow(X), M = ncol(X),
X = X,
y = y),
chains = n_cores)
check_hmc_diagnostics(stan_model_poisson_covs_gamma)
##
## Divergences:
##
## Tree depth:
##
## Energy:
pars <- c("beta","a_gamma","lp__")
print(stan_model_poisson_covs_gamma, pars = pars)
## Inference for Stan model: 6c9eb83e4c4e607ea8864804bdd7b685.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.63 0.00 0.20 2.24 2.49 2.62 2.76 3.03 2494 1.01
## beta[2] -0.01 0.00 0.00 -0.01 -0.01 -0.01 0.00 0.00 3709 1.00
## beta[3] -0.45 0.00 0.19 -0.82 -0.57 -0.44 -0.32 -0.09 2387 1.00
## beta[4] -1.28 0.00 0.21 -1.70 -1.42 -1.28 -1.14 -0.88 2970 1.00
## a_gamma 0.99 0.00 0.10 0.80 0.92 0.99 1.06 1.21 5969 1.00
## lp__ -1118.41 0.27 15.21 -1149.22 -1128.47 -1118.26 -1108.09 -1089.71 3159 1.00
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:28:26 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pairs(stan_model_poisson_covs_gamma, pars = pars)
traceplot(stan_model_poisson_covs_gamma, inc_warmup = TRUE, pars = pars)
plot_draws(stan_model_poisson_covs_gamma, n_sample = 20)
loo(stan_model_poisson_covs_gamma)
## Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
## Warning in log(z): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(z): NaNs produced
## Warning in log(z): NaNs produced
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1224.5 25.1
## p_loo 370.3 6.0
## looic 2448.9 50.1
## ------
## Monte Carlo SE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. n_eff
## (-Inf, 0.5] (good) 0 0.0% <NA>
## (0.5, 0.7] (ok) 4 1.3% 151
## (0.7, 1] (bad) 223 71.0% 13
## (1, Inf) (very bad) 87 27.7% 5
## See help('pareto-k-diagnostic') for details.
The LOO diagnostics for this model are poor; with one latent random effect \(\gamma_{i}\) per observation, each observation strongly influences its own parameters (note the very large p_loo), so importance-sampling LOO is unreliable here. If we marginalize over the latent individual-specific random effect \(\gamma_{i}\), we obtain the negative binomial model. In the following formulation, \(a_{\gamma}\) should have the same interpretation as in the previous model. \[\begin{align*} &\text{Prior}\\ \boldsymbol{\beta} &\sim MVN(\boldsymbol{0}, s\mathbf{I})\\ a_{\gamma} &\sim Gamma(a, b)\\ \phi &= \frac{1}{a_{\gamma}}\\ \\ &\text{Likelihood}\\ \eta_{i} &= \mathbf{X}_{i}^{T}\boldsymbol{\beta}\\ \mu_{i} &= e^{\eta_{i}}\\ y_{i} | \mu_{i}, \phi &\overset{\text{ind}}{\sim} NegBin(\mu_{i}, \phi)\\ \\ &\text{Stan definition}\\ \text{NegBinomial2}(y_{i} | \mu_{i}, \phi) &= \frac{\Gamma(y_{i} + \phi)}{y_{i}!\Gamma(\phi)} \left( \frac{\mu_{i}}{\mu_{i}+\phi} \right)^{y_{i}} \left( \frac{\phi}{\mu_{i}+\phi} \right)^{\phi}\\ \end{align*} \]
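Under this parameterization \(Var(y_{i}) = \mu_{i} + \mu_{i}^{2}/\phi\), so \(\phi = 1/a_{\gamma}\) controls the overdispersion (smaller \(\phi\) means more overdispersion). A small simulation sketch with hypothetical values mu_demo and phi_demo:
## rnbinom's size argument corresponds to phi in the NegBinomial2 parameterization.
mu_demo <- 5; phi_demo <- 1
y_sim <- rnbinom(1e5, mu = mu_demo, size = phi_demo)
c(empirical_var = var(y_sim), theoretical_var = mu_demo + mu_demo^2 / phi_demo)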
## http://rstudio-pubs-static.s3.amazonaws.com/34099_2e35c3966ef548c2918d5b6c2146bfd1.html
stan_code_negbin_covs <- biostan::read_stan_file("./bmspe_negbin_covs.stan")
biostan::print_stan_code(stan_code_negbin_covs, section = NULL)
## data {
## /* Hyperparameters*/
## real<lower=0> a;
## real<lower=0> b;
## real<lower=0> s;
##
## /* Dimensions */
## int<lower=0> N;
## int<lower=0> M;
## /* Design Matrix */
## matrix[N,M] X;
## /* Outcome */
## int<lower=0> y[N];
## }
##
## parameters {
## vector[M] beta;
## real<lower=0> a_gamma;
## }
##
## transformed parameters {
## vector[N] eta;
## real<lower=0> phi;
##
## eta = X * beta;
## phi = 1 / a_gamma;
## }
##
## model {
## /* Prior */
## for (j in 1:M) {
## target += normal_lpdf(beta[j] | 0, s);
## }
## target += gamma_lpdf(a_gamma | a, b);
##
## /* Likelihood */
## /* y_i ~ poisson(exp(eta_i)); */
## for (i in 1:N) {
## /* https://mc-stan.org/docs/2_18/functions-reference/nbalt.html */
## target += neg_binomial_2_lpmf(y[i] | exp(eta[i]), phi);
## }
## }
##
## generated quantities {
## int y_new[N];
## vector[N] log_lik;
##
## for (i in 1:N) {
## /* eta[i] is the log(mean). */
## if (eta[i] > 15) {
##     /* To avoid errors like the one below during warmup. */
## /* neg_binomial_2_rng: Random number that came from gamma distribution is 3.02668e+39, but must be less than 1.07374e+09 */
## /* https://groups.google.com/forum/#!topic/stan-users/4g2hbwtRELQ */
## /* Check posterior predictive for anomaly. */
## y_new[i] = -1;
## } else {
## y_new[i] = neg_binomial_2_rng(exp(eta[i]), phi);
## }
##
## log_lik[i] = neg_binomial_2_lpmf(y[i] | exp(eta[i]), phi);
## }
## }
##
stan_model_negbin_covs <- stan(model_code = stan_code_negbin_covs,
data = list(a = 10^(-3), b = 10^(-3),
s = 10,
N = nrow(X), M = ncol(X),
X = X,
y = y),
chains = n_cores)
check_hmc_diagnostics(stan_model_negbin_covs)
##
## Divergences:
##
## Tree depth:
##
## Energy:
pars <- c("beta","a_gamma","phi","lp__")
print(stan_model_negbin_covs, pars = pars)
## Inference for Stan model: e98734dc2427bc4c0dad3de78b51228b.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.63 0.00 0.20 2.25 2.50 2.63 2.77 3.03 4524 1
## beta[2] -0.01 0.00 0.00 -0.01 -0.01 -0.01 0.00 0.00 9593 1
## beta[3] -0.45 0.00 0.18 -0.82 -0.58 -0.45 -0.33 -0.10 4366 1
## beta[4] -1.29 0.00 0.20 -1.70 -1.43 -1.29 -1.16 -0.90 5033 1
## a_gamma 0.99 0.00 0.10 0.81 0.92 0.99 1.06 1.20 8674 1
## phi 1.02 0.00 0.10 0.83 0.95 1.01 1.09 1.23 8491 1
## lp__ -887.96 0.02 1.57 -891.83 -888.76 -887.64 -886.80 -885.89 4827 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:29:57 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
pairs(stan_model_negbin_covs, pars = pars)
traceplot(stan_model_negbin_covs, inc_warmup = TRUE, pars = pars)
plot_draws(stan_model_negbin_covs, n_sample = 20)
loo(stan_model_negbin_covs)
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -870.6 20.3
## p_loo 4.8 0.6
## looic 1741.1 40.5
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
The posterior for \(a_{\gamma}\) and the posterior predictive distributions are very similar to those from the Poisson model with gamma random effects, empirically supporting the equivalence of these two models.
## Frequentist negative binomial fit for comparison
summary(MASS::glm.nb(formula = daysabs ~ math + prog, data = data1))
##
## Call:
## MASS::glm.nb(formula = daysabs ~ math + prog, data = data1, init.theta = 1.032713156,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1547 -1.0192 -0.3694 0.2285 2.5273
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.615265 0.197460 13.245 < 2e-16 ***
## math -0.005993 0.002505 -2.392 0.0167 *
## progAcademic -0.440760 0.182610 -2.414 0.0158 *
## progVocational -1.278651 0.200720 -6.370 1.89e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.0327) family taken to be 1)
##
## Null deviance: 427.54 on 313 degrees of freedom
## Residual deviance: 358.52 on 310 degrees of freedom
## AIC: 1741.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.033
## Std. Err.: 0.106
##
## 2 x log-likelihood: -1731.258
Due to the vague priors, the Frequentist fit is again similar.
The corresponding posterior samples are very similar between these two models.
cat("Poisson with gamma random effects\n")
## Poisson with gamma random effects
print(stan_model_poisson_covs_gamma, pars = c("beta","a_gamma"))
## Inference for Stan model: 6c9eb83e4c4e607ea8864804bdd7b685.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.63 0 0.20 2.24 2.49 2.62 2.76 3.03 2494 1.01
## beta[2] -0.01 0 0.00 -0.01 -0.01 -0.01 0.00 0.00 3709 1.00
## beta[3] -0.45 0 0.19 -0.82 -0.57 -0.44 -0.32 -0.09 2387 1.00
## beta[4] -1.28 0 0.21 -1.70 -1.42 -1.28 -1.14 -0.88 2970 1.00
## a_gamma 0.99 0 0.10 0.80 0.92 0.99 1.06 1.21 5969 1.00
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:28:26 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
cat("Negative binomial\n")
## Negative binomial
print(stan_model_negbin_covs, pars = c("beta","a_gamma"))
## Inference for Stan model: e98734dc2427bc4c0dad3de78b51228b.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.63 0 0.20 2.25 2.50 2.63 2.77 3.03 4524 1
## beta[2] -0.01 0 0.00 -0.01 -0.01 -0.01 0.00 0.00 9593 1
## beta[3] -0.45 0 0.18 -0.82 -0.58 -0.45 -0.33 -0.10 4366 1
## beta[4] -1.29 0 0.20 -1.70 -1.43 -1.29 -1.16 -0.90 5033 1
## a_gamma 0.99 0 0.10 0.81 0.92 0.99 1.06 1.20 8674 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:29:57 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
Figures may be more informative.
both_draws <- bind_rows(tidybayes::tidy_draws(stan_model_poisson_covs_gamma) %>%
select(.chain, .iteration, .draw, starts_with("beta"), a_gamma) %>%
gather(key = key, value = value, starts_with("beta"), a_gamma) %>%
mutate(model = "Poisson Mixed"),
##
tidybayes::tidy_draws(stan_model_negbin_covs) %>%
select(.chain, .iteration, .draw, starts_with("beta"), a_gamma) %>%
gather(key = key, value = value, starts_with("beta"), a_gamma) %>%
mutate(model = "Negative Binomial"))
both_draws %>%
ggplot(mapping = aes(x = value, color = model)) +
geom_density() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_draws %>%
ggplot(mapping = aes(y = value, x = model, color = model)) +
geom_boxplot() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_draws %>%
ggplot(mapping = aes(y = value, x = model, color = model)) +
geom_violin() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_draws %>%
ggplot(mapping = aes(y = value, x = model, color = model)) +
tidybayes::geom_eye() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
Posterior predictive distributions can also be compared; they differ somewhat more between the two models than the parameter posteriors do.
both_ppd <- bind_rows(extract_post_pred(stan_model_poisson_covs_gamma) %>%
mutate(model = "Poisson Mixed"),
extract_post_pred(stan_model_negbin_covs) %>%
mutate(model = "Negative Binomial"))
both_ppd %>%
group_by(model, .chain, .iteration, .draw) %>%
count(value) %>%
group_by(model, value) %>%
tidybayes::median_qi(n, .width = c(0.95, 0.80, 0.50)) %>%
ggplot(mapping = aes(x = value, y = n)) +
tidybayes::geom_interval() +
geom_point(data = data1_count,
mapping = aes(x = y, y = n)) +
geom_line(data = data1_count,
mapping = aes(x = y, y = n)) +
scale_color_brewer() +
facet_wrap( ~ model) +
coord_cartesian(xlim = c(0, 50)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_ppd_qi <- both_ppd %>%
group_by(model, .chain, .iteration, .draw) %>%
count(value) %>%
group_by(model, value) %>%
tidybayes::median_qi(n, .width = c(0.95, 0.80, 0.50))
both_ppd_qi %>%
ggplot(mapping = aes(x = value, y = n)) +
geom_line() +
geom_ribbon(data = filter(both_ppd_qi, .width == 0.95),
mapping = aes(ymin = .lower, ymax = .upper),
fill = "gray95") +
geom_ribbon(data = filter(both_ppd_qi, .width == 0.80),
mapping = aes(ymin = .lower, ymax = .upper),
fill = "gray80") +
geom_ribbon(data = filter(both_ppd_qi, .width == 0.50),
mapping = aes(ymin = .lower, ymax = .upper),
fill = "gray50") +
geom_point(data = data1_count,
mapping = aes(x = y, y = n)) +
geom_line(data = data1_count,
mapping = aes(x = y, y = n)) +
facet_wrap( ~ model) +
coord_cartesian(xlim = c(0, 50)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
Now we compare the goodness of fit for the Poisson model and negative binomial model.
cat("Poisson\n")
## Poisson
print(stan_model_poisson_covs, pars = c("beta"))
## Inference for Stan model: 1bf021e5d6a2cfcbeea4a9c4dd4e7110.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.65 0 0.06 2.53 2.61 2.65 2.69 2.77 3957 1
## beta[2] -0.01 0 0.00 -0.01 -0.01 -0.01 -0.01 -0.01 8260 1
## beta[3] -0.44 0 0.06 -0.55 -0.48 -0.44 -0.40 -0.33 3929 1
## beta[4] -1.28 0 0.08 -1.44 -1.34 -1.28 -1.23 -1.13 4375 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:21:15 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
cat("Negative binomial\n")
## Negative binomial
print(stan_model_negbin_covs, pars = c("beta","a_gamma"))
## Inference for Stan model: e98734dc2427bc4c0dad3de78b51228b.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 2.63 0 0.20 2.25 2.50 2.63 2.77 3.03 4524 1
## beta[2] -0.01 0 0.00 -0.01 -0.01 -0.01 0.00 0.00 9593 1
## beta[3] -0.45 0 0.18 -0.82 -0.58 -0.45 -0.33 -0.10 4366 1
## beta[4] -1.29 0 0.20 -1.70 -1.43 -1.29 -1.16 -0.90 5033 1
## a_gamma 0.99 0 0.10 0.81 0.92 0.99 1.06 1.20 8674 1
##
## Samples were drawn using NUTS(diag_e) at Tue Jan 1 07:29:57 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
As seen here, the posterior means are very similar for the corresponding parameters. Again, figures may be more informative.
both_draws <- bind_rows(tidybayes::tidy_draws(stan_model_poisson_covs) %>%
select(.chain, .iteration, .draw, starts_with("beta")) %>%
gather(key = key, value = value, starts_with("beta")) %>%
mutate(model = "Poisson"),
##
tidybayes::tidy_draws(stan_model_negbin_covs) %>%
select(.chain, .iteration, .draw, starts_with("beta"), a_gamma) %>%
gather(key = key, value = value, starts_with("beta"), a_gamma) %>%
mutate(model = "Negative Binomial"))
both_draws %>%
ggplot(mapping = aes(x = value, color = model)) +
geom_density() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_draws %>%
ggplot(mapping = aes(y = value, x = model, color = model)) +
geom_boxplot() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_draws %>%
ggplot(mapping = aes(y = value, x = model, color = model)) +
geom_violin() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_draws %>%
ggplot(mapping = aes(y = value, x = model, color = model)) +
tidybayes::geom_eye() +
facet_wrap( ~ key, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
Now we can better appreciate how much wider the negative binomial posteriors are for the corresponding parameters.
Posterior predictive distributions can also be compared.
both_ppd <- bind_rows(extract_post_pred(stan_model_poisson_covs) %>%
mutate(model = "Poisson"),
extract_post_pred(stan_model_negbin_covs) %>%
mutate(model = "Negative Binomial"))
both_ppd %>%
group_by(model, .chain, .iteration, .draw) %>%
count(value) %>%
group_by(model, value) %>%
tidybayes::median_qi(n, .width = c(0.95, 0.80, 0.50)) %>%
ggplot(mapping = aes(x = value, y = n)) +
tidybayes::geom_interval() +
geom_point(data = data1_count,
mapping = aes(x = y, y = n)) +
geom_line(data = data1_count,
mapping = aes(x = y, y = n)) +
scale_color_brewer() +
facet_wrap( ~ model) +
coord_cartesian(xlim = c(0, 50)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
both_ppd_qi <- both_ppd %>%
group_by(model, .chain, .iteration, .draw) %>%
count(value) %>%
group_by(model, value) %>%
tidybayes::median_qi(n, .width = c(0.95, 0.80, 0.50))
both_ppd_qi %>%
ggplot(mapping = aes(x = value, y = n)) +
geom_line() +
geom_ribbon(data = filter(both_ppd_qi, .width == 0.95),
mapping = aes(ymin = .lower, ymax = .upper),
fill = "gray95") +
geom_ribbon(data = filter(both_ppd_qi, .width == 0.80),
mapping = aes(ymin = .lower, ymax = .upper),
fill = "gray80") +
geom_ribbon(data = filter(both_ppd_qi, .width == 0.50),
mapping = aes(ymin = .lower, ymax = .upper),
fill = "gray50") +
geom_point(data = data1_count,
mapping = aes(x = y, y = n)) +
geom_line(data = data1_count,
mapping = aes(x = y, y = n)) +
facet_wrap( ~ model) +
coord_cartesian(xlim = c(0, 50)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
In this case, the Poisson model clearly fits the data poorly. We can compare these models more formally using WAIC.
cat("WAIC Poisson")
## WAIC Poisson
waic_stan_model_poisson_covs <- loo::waic(loo::extract_log_lik(stan_model_poisson_covs))
## Warning: 18 (5.7%) p_waic estimates greater than 0.4. We recommend trying loo instead.
waic_stan_model_poisson_covs
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_waic -1343.5 74.5
## p_waic 25.1 3.4
## waic 2686.9 149.1
## Warning: 18 (5.7%) p_waic estimates greater than 0.4. We recommend trying loo instead.
cat("WAIC negative binomial")
## WAIC negative binomial
waic_stan_model_negbin_covs <- loo::waic(loo::extract_log_lik(stan_model_negbin_covs))
waic_stan_model_negbin_covs
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_waic -870.6 20.3
## p_waic 4.8 0.6
## waic 1741.1 40.5
cat("WAIC comparison")
## WAIC comparison
loo::compare(waic_stan_model_poisson_covs,
waic_stan_model_negbin_covs)
## elpd_diff se
## 472.9 61.2
The expected log pointwise predictive density (elpd) is higher for the negative binomial model by 472.9 (SE 61.2), roughly 7.7 standard errors from zero, indicating a clearly better fit.
We can also use approximate leave-one-out cross-validation (LOO).
cat("LOO Poisson")
## LOO Poisson
loo_stan_model_poisson_covs <- loo::loo(loo::extract_log_lik(stan_model_poisson_covs))
## Warning: Relative effective sample sizes ('r_eff' argument) not specified.
## For models fit with MCMC, the reported PSIS effective sample sizes and
## MCSE estimates will be over-optimistic.
loo_stan_model_poisson_covs
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1343.5 74.5
## p_loo 25.1 3.4
## looic 2686.9 149.1
## ------
## Monte Carlo SE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
cat("LOO negative binomial")
## LOO negative binomial
loo_stan_model_negbin_covs <- loo::loo(loo::extract_log_lik(stan_model_negbin_covs))
## Warning: Relative effective sample sizes ('r_eff' argument) not specified.
## For models fit with MCMC, the reported PSIS effective sample sizes and
## MCSE estimates will be over-optimistic.
loo_stan_model_negbin_covs
##
## Computed from 12000 by 314 log-likelihood matrix
##
## Estimate SE
## elpd_loo -870.6 20.3
## p_loo 4.8 0.6
## looic 1741.1 40.5
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
cat("LOO comparison")
## LOO comparison
loo::compare(loo_stan_model_poisson_covs,
loo_stan_model_negbin_covs)
## elpd_diff se
## 472.9 61.2
The LOO comparison gives essentially the same result as the WAIC comparison in this case.
print(sessionInfo())
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 loo_2.0.0 bayesplot_1.6.0 rstan_2.18.1 StanHeaders_2.18.0
## [6] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.7 purrr_0.2.5 readr_1.1.1
## [11] tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1 doRNG_1.7.1
## [16] rngtools_1.3.1 pkgmaker_0.27 registry_0.5 doParallel_1.0.14 iterators_1.0.10
## [21] foreach_1.4.4 knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] minqa_1.2.4 colorspace_1.3-2 ggridges_0.5.1 rsconnect_0.8.8
## [5] rprojroot_1.3-2 ggstance_0.3.1 markdown_0.8 base64enc_0.1-3
## [9] rstudioapi_0.8 svUnit_0.7-12 DT_0.4 fansi_0.4.0
## [13] lubridate_1.7.4 xml2_1.2.0 codetools_0.2-15 splines_3.5.1
## [17] pscl_1.5.2 shinythemes_1.1.1 jsonlite_1.5 nloptr_1.2.1
## [21] broom_0.5.0 shiny_1.1.0 compiler_3.5.1 httr_1.3.1
## [25] backports_1.1.2 assertthat_0.2.0 Matrix_1.2-14 lazyeval_0.2.1
## [29] cli_1.0.1 later_0.7.5 htmltools_0.3.6 prettyunits_1.0.2
## [33] tools_3.5.1 igraph_1.2.2 coda_0.19-2 gtable_0.2.0
## [37] glue_1.3.0 reshape2_1.4.3 Rcpp_0.12.19 cellranger_1.1.0
## [41] nlme_3.1-137 crosstalk_1.0.0 ps_1.2.0 lme4_1.1-18-1
## [45] rvest_0.3.2 mime_0.6 miniUI_0.1.1.1 tidybayes_1.0.3
## [49] gtools_3.8.1 MASS_7.3-51 zoo_1.8-4 scales_1.0.0
## [53] rstanarm_2.18.1 colourpicker_1.0 hms_0.4.2 promises_1.0.1
## [57] inline_0.3.15 RColorBrewer_1.1-2 shinystan_2.5.0 yaml_2.2.0
## [61] gridExtra_2.3 stringi_1.2.4 dygraphs_1.1.1.6 pkgbuild_1.0.2
## [65] bibtex_0.4.2 rlang_0.3.0.1 pkgconfig_2.0.2 matrixStats_0.54.0
## [69] evaluate_0.12 lattice_0.20-35 bindr_0.1.1 splines2_0.2.8
## [73] rstantools_1.5.1 htmlwidgets_1.3 labeling_0.3 processx_3.2.0
## [77] tidyselect_0.2.5 biostan_0.1.0 plyr_1.8.4 magrittr_1.5
## [81] R6_2.3.0 pillar_1.3.0 haven_1.1.2 withr_2.1.2
## [85] xts_0.11-1 survival_2.43-1 modelr_0.1.2 crayon_1.3.4
## [89] arrayhelpers_1.0-20160527 KernSmooth_2.23-15 utf8_1.1.4 rmarkdown_1.10
## [93] grid_3.5.1 readxl_1.1.0 callr_3.0.0 threejs_0.3.1
## [97] digest_0.6.18 xtable_1.8-3 httpuv_1.4.5 stats4_3.5.1
## [101] munsell_0.5.0 shinyjs_1.0
## Record execution time and multicore use
end_time <- Sys.time()
diff_time <- difftime(end_time, start_time, units = "auto")
cat("Started ", as.character(start_time), "\n",
"Finished ", as.character(end_time), "\n",
"Time difference of ", diff_time, " ", attr(diff_time, "units"), "\n",
"Used ", foreach::getDoParWorkers(), " cores\n",
"Used ", foreach::getDoParName(), " as backend\n",
sep = "")
## Started 2019-01-01 07:19:29
## Finished 2019-01-01 07:31:38
## Time difference of 12.14532 mins
## Used 12 cores
## Used doParallelMC as backend