Beta Regression in Stan

References

Load packages

library(tidyverse)
## https://github.com/jburos/biostan
## devtools::install_github('jburos/biostan', build_vignettes=TRUE, dependencies=TRUE)
library(biostan)

## Warning: replacing previous import 'rstan::loo' by 'rstanarm::loo' when loading 'biostan'

library(rstan)

Load data

FoodExpenditure            package:betareg             R Documentation
Proportion of Household Income Spent on Food
Description:
     Data on proportion of income spent on food for a random sample of
     38 households in a large US city.
Usage:
     data("FoodExpenditure")
Format:
     A data frame containing 38 observations on 3 variables.
     food household expenditures for food.
     income household income.
     persons number of persons living in household.
Source:
     Taken from Griffiths et al. (1993, Table 15.4).

data(FoodExpenditure, package = "betareg")
FoodExpenditure <-
    FoodExpenditure %>%
    as_tibble() %>%
    mutate(p_food = food / income)
FoodExpenditure

## # A tibble: 38 x 4
##     food income persons p_food
##    <dbl>  <dbl>   <int>  <dbl>
##  1 16.0    62.5       1  0.256
##  2 16.7    82.3       5  0.202
##  3 21.7    74.7       3  0.291
##  4  7.43   39.2       3  0.190
##  5 10.5    64.7       5  0.162
##  6 13.5    36.8       3  0.368
##  7 23.3    83.1       4  0.280
##  8 18.0    86.9       1  0.207
##  9 14.2    88.2       2  0.160
## 10  8.82   38.7       2  0.228
## # ... with 28 more rows

ggplot(data = FoodExpenditure, mapping = aes(x = persons, y = p_food)) +
    geom_point() +
    scale_y_continuous(limits = c(0,1)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())

ggplot(data = FoodExpenditure, mapping = aes(x = income, y = p_food)) +
    geom_point() +
    scale_y_continuous(limits = c(0,1)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
          legend.key = element_blank(),
          plot.title = element_text(hjust = 0.5),
          strip.background = element_blank())

Model specification

We will use the mean-precision parametrization of the beta distribution.

\[\begin{align*} f(y;\mu,\phi) &= \frac{\Gamma(\phi)}{\Gamma(\mu\phi)\Gamma((1-\mu)\phi)}y^{\mu\phi-1}(1-y)^{(1-\mu)\phi-1}, y \in (0,1) \end{align*} \]

The correspondence to the typical parametrization is: \(\alpha = \mu\phi\) and \(\beta = (1-\mu)\phi\). As a result the mean and variance are as follows.

\[\begin{align*} E[Y] &= \frac{\alpha}{\alpha+\beta}\\ &= \frac{\mu\phi}{\mu\phi + (1-\mu)\phi}\\ &= \frac{\mu\phi}{\phi}\\ &= \mu\\ \\ Var(Y) &= \frac{\alpha\beta}{(\alpha+\beta)^{2}(\alpha+\beta+1)}\\ &= \frac{\mu\phi(1-\mu)\phi}{\phi^{2}(\phi+1)}\\ &= \frac{\mu(1-\mu)}{\phi+1}\\ \end{align*} \]

These mean and precision parameters can be modeled as functions of covariates.

\[\begin{align*} g_{1}(\mu_{i}) &= \eta_{1i} = \mathbb{x}_{1i}^{T}\boldsymbol{\beta}_{x1}\\ g_{2}(\phi_{i}) &= \eta_{2i} = \mathbb{x}_{2i}^{T}\boldsymbol{\beta}_{x2}\\ \end{align*} \]

The sets of covariates \(\mathbb{x}_{1i}\) and \(\mathbb{x}_{2i}\) can differ.

stan_logit_log_beta_model_file <- "./beta_regression.stan"
biostan::print_stan_file(stan_logit_log_beta_model_file)

## data { 
##     /* Outcome data */ 
##     int<lower=0> N; 
##     real<lower=0,upper=1> y[N]; 
##     /* X1 matrix for the mean parameter. Include a constant column. */ 
##     int<lower=1> X1_dim; 
##     matrix[N, X1_dim] X1; 
##     real beta_x1_mean[X1_dim]; 
##     real<lower=0> beta_x1_sd[X1_dim]; 
##     /* X2 matrix for the precision parameter. Include a constant column. */ 
##     int<lower=1> X2_dim; 
##     matrix[N, X2_dim] X2; 
##     real beta_x2_mean[X2_dim]; 
##     real<lower=0> beta_x2_sd[X2_dim]; 
## } 
##  
## transformed data { 
##  
## } 
##  
## parameters { 
##     vector[X1_dim] beta_x1; 
##     vector[X2_dim] beta_x2; 
## } 
##  
## transformed parameters { 
##     vector[N] eta_x1 = X1 * beta_x1; 
##     vector[N] eta_x2 = X2 * beta_x2; 
##  
##     /* logit for mean. expit is the inverse. */ 
##     vector[N] mu = inv_logit(eta_x1); 
##     /* log for precision. exp is the inverse. */ 
##     vector[N] phi = exp(eta_x2); 
## } 
##  
## model { 
##  
##     /* Priors */ 
##     for (j in 1:X1_dim) { 
##         target += normal_lpdf(beta_x1[j] | beta_x1_mean[j], beta_x1_sd[j]); 
##     } 
##     for (k in 1:X2_dim) { 
##         target += normal_lpdf(beta_x2[k] | beta_x2_mean[k], beta_x2_sd[k]); 
##     } 
##  
##     /* Mean model */ 
##     for (i in 1:N) { 
##         target += beta_lpdf(y[i] | (mu .* phi)[i], ((1-mu) .* phi)[i]); 
##     } 
##  
## } 
##  
## generated quantities { 
##  
## } 
##

Model fit

## Prepare dataset
N <- nrow(FoodExpenditure)
y <- FoodExpenditure$p_food
##  Mean model part
X1 <- model.matrix(object = ~ income + persons, data = FoodExpenditure)
X1_dim <- ncol(X1)
beta_x1_mean <- rep(0, X1_dim)
beta_x1_sd <- c(10, rep(1, X1_dim-1))
##  Precision model part
X2 <- model.matrix(object = ~ income + persons, data = FoodExpenditure)
X2_dim <- ncol(X2)
beta_x2_mean <- rep(0, X2_dim)
beta_x2_sd <- c(10, rep(1, X2_dim-1))

stan_logit_log_beta_model_fit1 <-
    rstan::stan(file = stan_logit_log_beta_model_file,
                data = list(N = N,
                            y = y,
                            X1 = X1,
                            X1_dim = X1_dim,
                            beta_x1_mean = beta_x1_mean,
                            beta_x1_sd = beta_x1_sd,
                            X2 = X2,
                            X2_dim = X2_dim,
                            beta_x2_mean = beta_x2_mean,
                            beta_x2_sd = beta_x2_sd),
                chains = n_cores,
                cores = n_cores,
                verbose = TRUE)

MCMC diagnostics

traceplot(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2"), inc_warmup = TRUE)

pairs(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2","lp__"))

## More comprehensive and interactive
## shinystan::launch_shinystan(stan_logit_log_beta_model_fit1)

Model results

print(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2","lp__"))

## Inference for Stan model: beta_regression.
## 12 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=12000.
## 
##             mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## beta_x1[1] -0.76    0.00 0.20 -1.16 -0.89 -0.76 -0.63 -0.35  6454    1
## beta_x1[2] -0.01    0.00 0.00 -0.01 -0.01 -0.01 -0.01  0.00  6773    1
## beta_x1[3]  0.09    0.00 0.04  0.01  0.07  0.09  0.12  0.17  7357    1
## beta_x2[1]  4.87    0.01 1.00  2.82  4.22  4.90  5.55  6.74  5001    1
## beta_x2[2]  0.00    0.00 0.01 -0.02  0.00  0.01  0.01  0.03  6503    1
## beta_x2[3] -0.42    0.00 0.17 -0.75 -0.54 -0.43 -0.31 -0.09  6421    1
## lp__       35.75    0.03 1.81 31.29 34.79 36.08 37.09 38.24  4191    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Feb 12 07:19:02 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

plot(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2"))

Top Page: http://rpubs.com/kaz_yos/
Github: https://github.com/kaz-yos

print(sessionInfo())

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2     rstan_2.18.1       StanHeaders_2.18.0 biostan_0.1.0      forcats_0.3.0     
##  [6] stringr_1.3.1      dplyr_0.7.7        purrr_0.2.5        readr_1.1.1        tidyr_0.8.2       
## [11] tibble_1.4.2       ggplot2_3.1.0      tidyverse_1.2.1    doRNG_1.7.1        rngtools_1.3.1    
## [16] pkgmaker_0.27      registry_0.5       doParallel_1.0.14  iterators_1.0.10   foreach_1.4.4     
## [21] knitr_1.20        
## 
## loaded via a namespace (and not attached):
##  [1] minqa_1.2.4        colorspace_1.3-2   ggridges_0.5.1     rsconnect_0.8.8    rprojroot_1.3-2   
##  [6] markdown_0.8       base64enc_0.1-3    rstudioapi_0.8     DT_0.4             fansi_0.4.0       
## [11] lubridate_1.7.4    xml2_1.2.0         codetools_0.2-15   splines_3.5.1      shinythemes_1.1.1 
## [16] bayesplot_1.6.0    jsonlite_1.5       nloptr_1.2.1       broom_0.5.0        shiny_1.1.0       
## [21] compiler_3.5.1     httr_1.3.1         backports_1.1.2    assertthat_0.2.0   Matrix_1.2-14     
## [26] lazyeval_0.2.1     cli_1.0.1          later_0.7.5        htmltools_0.3.6    prettyunits_1.0.2 
## [31] tools_3.5.1        igraph_1.2.2       gtable_0.2.0       glue_1.3.0         reshape2_1.4.3    
## [36] Rcpp_0.12.19       cellranger_1.1.0   nlme_3.1-137       crosstalk_1.0.0    ps_1.2.0          
## [41] lme4_1.1-18-1      rvest_0.3.2        mime_0.6           miniUI_0.1.1.1     gtools_3.8.1      
## [46] MASS_7.3-51        zoo_1.8-4          scales_1.0.0       rstanarm_2.18.1    colourpicker_1.0  
## [51] hms_0.4.2          promises_1.0.1     inline_0.3.15      shinystan_2.5.0    yaml_2.2.0        
## [56] gridExtra_2.3      loo_2.0.0          stringi_1.2.4      dygraphs_1.1.1.6   pkgbuild_1.0.2    
## [61] bibtex_0.4.2       rlang_0.3.0.1      pkgconfig_2.0.2    matrixStats_0.54.0 evaluate_0.12     
## [66] lattice_0.20-35    bindr_0.1.1        splines2_0.2.8     labeling_0.3       rstantools_1.5.1  
## [71] htmlwidgets_1.3    processx_3.2.0     tidyselect_0.2.5   plyr_1.8.4         magrittr_1.5      
## [76] R6_2.3.0           pillar_1.3.0       haven_1.1.2        withr_2.1.2        xts_0.11-1        
## [81] survival_2.43-1    modelr_0.1.2       crayon_1.3.4       KernSmooth_2.23-15 utf8_1.1.4        
## [86] rmarkdown_1.10     grid_3.5.1         readxl_1.1.0       callr_3.0.0        threejs_0.3.1     
## [91] digest_0.6.18      xtable_1.8-3       httpuv_1.4.5       stats4_3.5.1       munsell_0.5.0     
## [96] shinyjs_1.0

## Record execution time and multicore use
end_time <- Sys.time()
diff_time <- difftime(end_time, start_time, units = "auto")
cat("Started  ", as.character(start_time), "\n",
    "Finished ", as.character(end_time), "\n",
    "Time difference of ", diff_time, " ", attr(diff_time, "units"), "\n",
    "Used ", foreach::getDoParWorkers(), " cores\n",
    "Used ", foreach::getDoParName(), " as backend\n",
    sep = "")

## Started  2019-02-12 07:17:55
## Finished 2019-02-12 07:19:23
## Time difference of 1.467137 mins
## Used 12 cores
## Used doParallelMC as backend