Matuura Salary Simple Linear Regression Example

References

Ch4. Matsuura. Bayesian Statistical Modeling with Stan and R.

Load packages

library(tidyverse)
library(rstan)
library(ggmcmc)

Create data

data1 <- tribble(
    ~X,~Y,
    24,472,
    24,403,
    26,454,
    32,575,
    33,546,
    35,781,
    38,750,
    40,601,
    40,814,
    43,792,
    43,745,
    44,837,
    48,868,
    52,988,
    56,1092,
    56,1007,
    57,1233,
    58,1202,
    59,1123,
    59,1314)

data1

## # A tibble: 20 x 2
##        X     Y
##    <dbl> <dbl>
##  1    24   472
##  2    24   403
##  3    26   454
##  4    32   575
##  5    33   546
##  6    35   781
##  7    38   750
##  8    40   601
##  9    40   814
## 10    43   792
## 11    43   745
## 12    44   837
## 13    48   868
## 14    52   988
## 15    56  1092
## 16    56  1007
## 17    57  1233
## 18    58  1202
## 19    59  1123
## 20    59  1314

Check data

ggplot(data = data1, mapping = aes(x = X, y = Y)) +
    geom_point() +
    geom_smooth(method = "lm") +
    theme_bw() + theme(legend.key = element_blank())

lm fit for sanity check

lm1 <- lm(formula = Y ~ X,
          data    = data1)
tableone::ShowRegTable(lm1, exp = FALSE)

##             coef [confint]           p     
## (Intercept) -119.70 [-262.87, 23.48]  0.096
## X             21.90 [18.71, 25.09]   <0.001

Stan fit

## Need to create a list to pass to Stan
data_list <- list(N = nrow(data1),
                  X = data1$X,
                  Y = data1$Y)

stan1 <- stan(file = "./salary.stan", data = data_list)

## In file included from file3d83bca6b7b.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/src/stan/model/model_header.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/mat.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core.hpp:12:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core/gevv_vvv_vari.hpp:5:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core/var.hpp:7:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/math/tools/config.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/config.hpp:39:
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/config/compiler/clang.hpp:196:11: warning: 'BOOST_NO_CXX11_RVALUE_REFERENCES' macro redefined [-Wmacro-redefined]
## #  define BOOST_NO_CXX11_RVALUE_REFERENCES
##           ^
## <command line>:6:9: note: previous definition is here
## #define BOOST_NO_CXX11_RVALUE_REFERENCES 1
##         ^
## In file included from file3d83bca6b7b.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/src/stan/model/model_header.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/mat.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core.hpp:42:
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core/set_zero_all_adjoints.hpp:14:17: warning: unused function 'set_zero_all_adjoints' [-Wunused-function]
##     static void set_zero_all_adjoints() {
##                 ^
## In file included from file3d83bca6b7b.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/src/stan/model/model_header.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/mat.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core.hpp:43:
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/core/set_zero_all_adjoints_nested.hpp:17:17: warning: 'static' function 'set_zero_all_adjoints_nested' declared in header file should be declared 'static inline' [-Wunneeded-internal-declaration]
##     static void set_zero_all_adjoints_nested() {
##                 ^
## In file included from file3d83bca6b7b.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/src/stan/model/model_header.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/mat.hpp:12:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/prim/mat.hpp:58:
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/autocorrelation.hpp:17:14: warning: function 'fft_next_good_size' is not needed and will not be emitted [-Wunneeded-internal-declaration]
##       size_t fft_next_good_size(size_t N) {
##              ^
## In file included from file3d83bca6b7b.cpp:8:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/src/stan/model/model_header.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math.hpp:4:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/rev/mat.hpp:12:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/prim/mat.hpp:298:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/prim/arr.hpp:38:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/StanHeaders/include/stan/math/prim/arr/functor/integrate_ode_rk45.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/numeric/odeint.hpp:61:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/numeric/odeint/util/multi_array_adaption.hpp:29:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/multi_array.hpp:21:
## In file included from /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/multi_array/base.hpp:28:
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/multi_array/concept_checks.hpp:42:43: warning: unused typedef 'index_range' [-Wunused-local-typedef]
##       typedef typename Array::index_range index_range;
##                                           ^
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/multi_array/concept_checks.hpp:43:37: warning: unused typedef 'index' [-Wunused-local-typedef]
##       typedef typename Array::index index;
##                                     ^
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/multi_array/concept_checks.hpp:53:43: warning: unused typedef 'index_range' [-Wunused-local-typedef]
##       typedef typename Array::index_range index_range;
##                                           ^
## /Library/Frameworks/R.framework/Versions/3.4/Resources/library/BH/include/boost/multi_array/concept_checks.hpp:54:37: warning: unused typedef 'index' [-Wunused-local-typedef]
##       typedef typename Array::index index;
##                                     ^
## 8 warnings generated.
## 
## SAMPLING FOR MODEL 'salary' NOW (CHAIN 1).
## 
## Gradient evaluation took 4.4e-05 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0.44 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:    1 / 2000 [  0%]  (Warmup)
## Iteration:  200 / 2000 [ 10%]  (Warmup)
## Iteration:  400 / 2000 [ 20%]  (Warmup)
## Iteration:  600 / 2000 [ 30%]  (Warmup)
## Iteration:  800 / 2000 [ 40%]  (Warmup)
## Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Iteration: 2000 / 2000 [100%]  (Sampling)
## 
##  Elapsed Time: 0.515523 seconds (Warm-up)
##                0.321766 seconds (Sampling)
##                0.837289 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'salary' NOW (CHAIN 2).
## 
## Gradient evaluation took 1.9e-05 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0.19 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:    1 / 2000 [  0%]  (Warmup)
## Iteration:  200 / 2000 [ 10%]  (Warmup)
## Iteration:  400 / 2000 [ 20%]  (Warmup)
## Iteration:  600 / 2000 [ 30%]  (Warmup)
## Iteration:  800 / 2000 [ 40%]  (Warmup)
## Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Iteration: 2000 / 2000 [100%]  (Sampling)
## 
##  Elapsed Time: 0.490234 seconds (Warm-up)
##                0.212108 seconds (Sampling)
##                0.702342 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'salary' NOW (CHAIN 3).
## 
## Gradient evaluation took 1.3e-05 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0.13 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:    1 / 2000 [  0%]  (Warmup)
## Iteration:  200 / 2000 [ 10%]  (Warmup)
## Iteration:  400 / 2000 [ 20%]  (Warmup)
## Iteration:  600 / 2000 [ 30%]  (Warmup)
## Iteration:  800 / 2000 [ 40%]  (Warmup)
## Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Iteration: 2000 / 2000 [100%]  (Sampling)
## 
##  Elapsed Time: 0.454587 seconds (Warm-up)
##                0.341104 seconds (Sampling)
##                0.795691 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'salary' NOW (CHAIN 4).
## 
## Gradient evaluation took 2.7e-05 seconds
## 1000 transitions using 10 leapfrog steps per transition would take 0.27 seconds.
## Adjust your expectations accordingly!
## 
## 
## Iteration:    1 / 2000 [  0%]  (Warmup)
## Iteration:  200 / 2000 [ 10%]  (Warmup)
## Iteration:  400 / 2000 [ 20%]  (Warmup)
## Iteration:  600 / 2000 [ 30%]  (Warmup)
## Iteration:  800 / 2000 [ 40%]  (Warmup)
## Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Iteration: 2000 / 2000 [100%]  (Sampling)
## 
##  Elapsed Time: 0.486255 seconds (Warm-up)
##                0.201203 seconds (Sampling)
##                0.687458 seconds (Total)

Model examination

## Model file
get_stanmodel(stan1)

## S4 class stanmodel 'salary' coded as follows:
## data {
##     int N;
##     real X[N];
##     real Y[N];
## }
## 
## 
## parameters {
## 
##     /* Parameters of interests */
##     real beta0;
##     real beta1;
##     /* Nuisance parameter */
##     real<lower=0> sigma;
## 
## }
## 
## 
## model {
## 
##     /* Likelihood */
##     for (i in 1:N) {
##         Y[i] ~ normal(beta0 + beta1 * X[i], sigma);
##     }
##     
##     /* Non-informative prior implied */
## }

## Computation time
get_elapsed_time(stan1)

##           warmup   sample
## chain:1 0.515523 0.321766
## chain:2 0.490234 0.212108
## chain:3 0.454587 0.341104
## chain:4 0.486255 0.201203

## Initial values for each chain
get_inits(stan1)

## [[1]]
## [[1]]$beta0
## [1] 1.230078
## 
## [[1]]$beta1
## [1] -1.826941
## 
## [[1]]$sigma
## [1] 1.172365
## 
## 
## [[2]]
## [[2]]$beta0
## [1] -1.925283
## 
## [[2]]$beta1
## [1] -0.4918617
## 
## [[2]]$sigma
## [1] 3.384842
## 
## 
## [[3]]
## [[3]]$beta0
## [1] -1.638182
## 
## [[3]]$beta1
## [1] -1.064468
## 
## [[3]]$sigma
## [1] 0.8219448
## 
## 
## [[4]]
## [[4]]$beta0
## [1] 1.331988
## 
## [[4]]$beta1
## [1] 1.829562
## 
## [[4]]$sigma
## [1] 2.862171

## Seed for each chain
get_seed(stan1)

## [1] 1970907798

## Results
stan1

## Inference for Stan model: salary.
## 4 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=4000.
## 
##          mean se_mean    sd    2.5%     25%     50%    75%  97.5% n_eff Rhat
## beta0 -118.04    1.97 72.75 -258.25 -165.23 -118.36 -72.16  26.49  1363    1
## beta1   21.86    0.04  1.62   18.59   20.84   21.87  22.89  24.99  1346    1
## sigma   84.53    0.39 15.88   60.32   73.48   82.33  92.68 121.54  1653    1
## lp__   -93.64    0.04  1.33  -97.10  -94.23  -93.28 -92.68 -92.14  1202    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Jul  4 00:37:25 2017.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

## Summary
summary(stan1)

## $summary
##             mean    se_mean        sd       2.5%        25%        50%       75%     97.5%    n_eff     Rhat
## beta0 -118.03557 1.97028572 72.748035 -258.25213 -165.23260 -118.35904 -72.15879  26.49050 1363.277 1.000887
## beta1   21.85854 0.04411657  1.618466   18.58960   20.83599   21.86978  22.89211  24.99071 1345.872 1.000892
## sigma   84.52520 0.39050671 15.875316   60.32051   73.48387   82.32750  92.68133 121.54393 1652.676 1.002584
## lp__   -93.64000 0.03847493  1.333885  -97.09937  -94.22646  -93.28080 -92.67910 -92.13741 1201.935 1.001915
## 
## $c_summary
## , , chains = chain:1
## 
##          stats
## parameter       mean        sd       2.5%        25%        50%       75%     97.5%
##     beta0 -117.89551 71.565957 -251.71470 -165.08179 -119.44672 -70.82849  18.38938
##     beta1   21.83986  1.594733   18.70088   20.77708   21.86299  22.91317  24.99195
##     sigma   84.60532 15.794519   60.18423   73.72317   82.29529  92.75919 121.23665
##     lp__   -93.62147  1.345125  -97.01983  -94.20297  -93.26760 -92.65686 -92.12740
## 
## , , chains = chain:2
## 
##          stats
## parameter       mean        sd       2.5%        25%        50%       75%     97.5%
##     beta0 -118.04036 73.516152 -263.51088 -165.96921 -115.03016 -68.40623  19.50686
##     beta1   21.86239  1.642312   18.62426   20.83497   21.81397  22.89021  25.08177
##     sigma   86.22674 17.474431   60.57548   73.73674   83.48702  96.27961 125.36751
##     lp__   -93.75089  1.404648  -97.33940  -94.43465  -93.40912 -92.68607 -92.14880
## 
## , , chains = chain:3
## 
##          stats
## parameter       mean        sd       2.5%        25%        50%       75%     97.5%
##     beta0 -123.11789 70.417550 -265.14200 -165.85381 -125.70109 -77.55371  20.14472
##     beta1   21.97516  1.559524   18.73129   21.03883   22.06604  22.91334  25.05479
##     sigma   84.02194 14.550677   61.47518   74.07790   82.61394  92.16271 117.89610
##     lp__   -93.50731  1.225361  -96.76198  -93.94542  -93.21363 -92.64446 -92.12535
## 
## , , chains = chain:4
## 
##          stats
## parameter       mean        sd       2.5%        25%        50%       75%     97.5%
##     beta0 -113.08853 75.164377 -250.61689 -164.79349 -114.99048 -70.88472  53.12659
##     beta1   21.75676  1.669962   18.26603   20.75479   21.76324  22.84880  24.84600
##     sigma   83.24679 15.411707   59.16397   72.52632   81.18587  90.84440 119.26291
##     lp__   -93.68032  1.344070  -97.08792  -94.24617  -93.30376 -92.71408 -92.17411

## Plot mean and variability of posterior
plot(stan1)

## Trace
traceplot(stan1)

ggmcmc

## ggmcmc do them all
ggmcmc(ggs(stan1, inc_warmup = TRUE, stan_include_auxiliar = TRUE), file = NULL)

## Plotting histograms

## Plotting density plots

## Plotting traceplots

## Plotting running means

## Plotting comparison of partial and full chain

## Plotting autocorrelation plots

## Plotting crosscorrelation plot

## Plotting Potential Scale Reduction Factors

## Plotting Geweke Diagnostic

## Plotting caterpillar plot
## Time taken to generate the report: 40 seconds.

Top Page: http://rpubs.com/kaz_yos/
github: https://github.com/kaz-yos

Matuura Salary Simple Linear Regression Example

Kazuki Yoshida

2017-07-04

References

Load packages

Create data

Check data

lm fit for sanity check

Stan fit

Model examination

ggmcmc