library(tidyverse)
## https://github.com/jburos/biostan
## devtools::install_github('jburos/biostan', build_vignettes=TRUE, dependencies=TRUE)
library(biostan)
## Warning: replacing previous import 'rstan::loo' by 'rstanarm::loo' when loading 'biostan'
library(rstan)
FoodExpenditure package:betareg R Documentation
Proportion of Household Income Spent on Food
Description:
Data on proportion of income spent on food for a random sample of
38 households in a large US city.
Usage:
data("FoodExpenditure")
Format:
A data frame containing 38 observations on 3 variables.
food household expenditures for food.
income household income.
persons number of persons living in household.
Source:
Taken from Griffiths et al. (1993, Table 15.4).
data(FoodExpenditure, package = "betareg")
FoodExpenditure <-
FoodExpenditure %>%
as_tibble() %>%
mutate(p_food = food / income)
FoodExpenditure
## # A tibble: 38 x 4
## food income persons p_food
## <dbl> <dbl> <int> <dbl>
## 1 16.0 62.5 1 0.256
## 2 16.7 82.3 5 0.202
## 3 21.7 74.7 3 0.291
## 4 7.43 39.2 3 0.190
## 5 10.5 64.7 5 0.162
## 6 13.5 36.8 3 0.368
## 7 23.3 83.1 4 0.280
## 8 18.0 86.9 1 0.207
## 9 14.2 88.2 2 0.160
## 10 8.82 38.7 2 0.228
## # ... with 28 more rows
ggplot(data = FoodExpenditure, mapping = aes(x = persons, y = p_food)) +
geom_point() +
scale_y_continuous(limits = c(0,1)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
ggplot(data = FoodExpenditure, mapping = aes(x = income, y = p_food)) +
geom_point() +
scale_y_continuous(limits = c(0,1)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
legend.key = element_blank(),
plot.title = element_text(hjust = 0.5),
strip.background = element_blank())
We will use the mean-precision parametrization of the beta distribution.
\[\begin{align*} f(y;\mu,\phi) &= \frac{\Gamma(\phi)}{\Gamma(\mu\phi)\Gamma((1-\mu)\phi)}y^{\mu\phi-1}(1-y)^{(1-\mu)\phi-1}, y \in (0,1) \end{align*} \]
The correspondence to the typical parametrization is: \(\alpha = \mu\phi\) and \(\beta = (1-\mu)\phi\). As a result the mean and variance are as follows.
\[\begin{align*} E[Y] &= \frac{\alpha}{\alpha+\beta}\\ &= \frac{\mu\phi}{\mu\phi + (1-\mu)\phi}\\ &= \frac{\mu\phi}{\phi}\\ &= \mu\\ \\ Var(Y) &= \frac{\alpha\beta}{(\alpha+\beta)^{2}(\alpha+\beta+1)}\\ &= \frac{\mu\phi(1-\mu)\phi}{\phi^{2}(\phi+1)}\\ &= \frac{\mu(1-\mu)}{\phi+1}\\ \end{align*} \]
These mean and precision parameters can be modeled as functions of covariates.
\[\begin{align*} g_{1}(\mu_{i}) &= \eta_{1i} = \mathbb{x}_{1i}^{T}\boldsymbol{\beta}_{x1}\\ g_{2}(\phi_{i}) &= \eta_{2i} = \mathbb{x}_{2i}^{T}\boldsymbol{\beta}_{x2}\\ \end{align*} \]
The sets of covariates \(\mathbb{x}_{1i}\) and \(\mathbb{x}_{2i}\) can differ.
stan_logit_log_beta_model_file <- "./beta_regression.stan"
biostan::print_stan_file(stan_logit_log_beta_model_file)
## data {
## /* Outcome data */
## int<lower=0> N;
## real<lower=0,upper=1> y[N];
## /* X1 matrix for the mean parameter. Include a constant column. */
## int<lower=1> X1_dim;
## matrix[N, X1_dim] X1;
## real beta_x1_mean[X1_dim];
## real<lower=0> beta_x1_sd[X1_dim];
## /* X2 matrix for the precision parameter. Include a constant column. */
## int<lower=1> X2_dim;
## matrix[N, X2_dim] X2;
## real beta_x2_mean[X2_dim];
## real<lower=0> beta_x2_sd[X2_dim];
## }
##
## transformed data {
##
## }
##
## parameters {
## vector[X1_dim] beta_x1;
## vector[X2_dim] beta_x2;
## }
##
## transformed parameters {
## vector[N] eta_x1 = X1 * beta_x1;
## vector[N] eta_x2 = X2 * beta_x2;
##
## /* logit for mean. expit is the inverse. */
## vector[N] mu = inv_logit(eta_x1);
## /* log for precision. exp is the inverse. */
## vector[N] phi = exp(eta_x2);
## }
##
## model {
##
## /* Priors */
## for (j in 1:X1_dim) {
## target += normal_lpdf(beta_x1[j] | beta_x1_mean[j], beta_x1_sd[j]);
## }
## for (k in 1:X2_dim) {
## target += normal_lpdf(beta_x2[k] | beta_x2_mean[k], beta_x2_sd[k]);
## }
##
## /* Mean model */
## for (i in 1:N) {
## target += beta_lpdf(y[i] | (mu .* phi)[i], ((1-mu) .* phi)[i]);
## }
##
## }
##
## generated quantities {
##
## }
##
## Prepare dataset
N <- nrow(FoodExpenditure)
y <- FoodExpenditure$p_food
## Mean model part
X1 <- model.matrix(object = ~ income + persons, data = FoodExpenditure)
X1_dim <- ncol(X1)
beta_x1_mean <- rep(0, X1_dim)
beta_x1_sd <- c(10, rep(1, X1_dim-1))
## Precision model part
X2 <- model.matrix(object = ~ income + persons, data = FoodExpenditure)
X2_dim <- ncol(X2)
beta_x2_mean <- rep(0, X2_dim)
beta_x2_sd <- c(10, rep(1, X2_dim-1))
stan_logit_log_beta_model_fit1 <-
rstan::stan(file = stan_logit_log_beta_model_file,
data = list(N = N,
y = y,
X1 = X1,
X1_dim = X1_dim,
beta_x1_mean = beta_x1_mean,
beta_x1_sd = beta_x1_sd,
X2 = X2,
X2_dim = X2_dim,
beta_x2_mean = beta_x2_mean,
beta_x2_sd = beta_x2_sd),
chains = n_cores,
cores = n_cores,
verbose = TRUE)
traceplot(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2"), inc_warmup = TRUE)
pairs(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2","lp__"))
## More comprehensive and interactive
## shinystan::launch_shinystan(stan_logit_log_beta_model_fit1)
print(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2","lp__"))
## Inference for Stan model: beta_regression.
## 12 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=12000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta_x1[1] -0.76 0.00 0.20 -1.16 -0.89 -0.76 -0.63 -0.35 6454 1
## beta_x1[2] -0.01 0.00 0.00 -0.01 -0.01 -0.01 -0.01 0.00 6773 1
## beta_x1[3] 0.09 0.00 0.04 0.01 0.07 0.09 0.12 0.17 7357 1
## beta_x2[1] 4.87 0.01 1.00 2.82 4.22 4.90 5.55 6.74 5001 1
## beta_x2[2] 0.00 0.00 0.01 -0.02 0.00 0.01 0.01 0.03 6503 1
## beta_x2[3] -0.42 0.00 0.17 -0.75 -0.54 -0.43 -0.31 -0.09 6421 1
## lp__ 35.75 0.03 1.81 31.29 34.79 36.08 37.09 38.24 4191 1
##
## Samples were drawn using NUTS(diag_e) at Tue Feb 12 07:19:02 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
plot(stan_logit_log_beta_model_fit1, pars = c("beta_x1","beta_x2"))
print(sessionInfo())
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 rstan_2.18.1 StanHeaders_2.18.0 biostan_0.1.0 forcats_0.3.0
## [6] stringr_1.3.1 dplyr_0.7.7 purrr_0.2.5 readr_1.1.1 tidyr_0.8.2
## [11] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1 doRNG_1.7.1 rngtools_1.3.1
## [16] pkgmaker_0.27 registry_0.5 doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4
## [21] knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] minqa_1.2.4 colorspace_1.3-2 ggridges_0.5.1 rsconnect_0.8.8 rprojroot_1.3-2
## [6] markdown_0.8 base64enc_0.1-3 rstudioapi_0.8 DT_0.4 fansi_0.4.0
## [11] lubridate_1.7.4 xml2_1.2.0 codetools_0.2-15 splines_3.5.1 shinythemes_1.1.1
## [16] bayesplot_1.6.0 jsonlite_1.5 nloptr_1.2.1 broom_0.5.0 shiny_1.1.0
## [21] compiler_3.5.1 httr_1.3.1 backports_1.1.2 assertthat_0.2.0 Matrix_1.2-14
## [26] lazyeval_0.2.1 cli_1.0.1 later_0.7.5 htmltools_0.3.6 prettyunits_1.0.2
## [31] tools_3.5.1 igraph_1.2.2 gtable_0.2.0 glue_1.3.0 reshape2_1.4.3
## [36] Rcpp_0.12.19 cellranger_1.1.0 nlme_3.1-137 crosstalk_1.0.0 ps_1.2.0
## [41] lme4_1.1-18-1 rvest_0.3.2 mime_0.6 miniUI_0.1.1.1 gtools_3.8.1
## [46] MASS_7.3-51 zoo_1.8-4 scales_1.0.0 rstanarm_2.18.1 colourpicker_1.0
## [51] hms_0.4.2 promises_1.0.1 inline_0.3.15 shinystan_2.5.0 yaml_2.2.0
## [56] gridExtra_2.3 loo_2.0.0 stringi_1.2.4 dygraphs_1.1.1.6 pkgbuild_1.0.2
## [61] bibtex_0.4.2 rlang_0.3.0.1 pkgconfig_2.0.2 matrixStats_0.54.0 evaluate_0.12
## [66] lattice_0.20-35 bindr_0.1.1 splines2_0.2.8 labeling_0.3 rstantools_1.5.1
## [71] htmlwidgets_1.3 processx_3.2.0 tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
## [76] R6_2.3.0 pillar_1.3.0 haven_1.1.2 withr_2.1.2 xts_0.11-1
## [81] survival_2.43-1 modelr_0.1.2 crayon_1.3.4 KernSmooth_2.23-15 utf8_1.1.4
## [86] rmarkdown_1.10 grid_3.5.1 readxl_1.1.0 callr_3.0.0 threejs_0.3.1
## [91] digest_0.6.18 xtable_1.8-3 httpuv_1.4.5 stats4_3.5.1 munsell_0.5.0
## [96] shinyjs_1.0
## Record execution time and multicore use
end_time <- Sys.time()
diff_time <- difftime(end_time, start_time, units = "auto")
cat("Started ", as.character(start_time), "\n",
"Finished ", as.character(end_time), "\n",
"Time difference of ", diff_time, " ", attr(diff_time, "units"), "\n",
"Used ", foreach::getDoParWorkers(), " cores\n",
"Used ", foreach::getDoParName(), " as backend\n",
sep = "")
## Started 2019-02-12 07:17:55
## Finished 2019-02-12 07:19:23
## Time difference of 1.467137 mins
## Used 12 cores
## Used doParallelMC as backend