1 Data

In this example weight of a person in lbs is predicted by height in inches.

We calculate the HDI using source the utilities file from (Kruschke 2015).

library(rjags)
library(rstan)
library(shinystan)
#options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
#Sys.setenv(LOCAL_CPPFLAGS = '-march=native') # avoid to run this line _-_ it was an old version
library("DT") #datatable

2 Simple regression model

\[ y = \beta_0 + \beta_1 x + \epsilon \]

With Bayesian approach distribution of \(\epsilon\) does not have to be Gaussian (normal), we are going to use robust assumption.

Parameterization of the model for MCMC (adapted from Kruschke (2015))

Bayes theorem for this model looks like this:

\[ p(\beta_0, \beta_1, \sigma, \gamma \mid D) = \frac{p(D \mid \beta_0, \beta_1, \sigma, \gamma) \ p(\beta_0, \beta_1, \sigma, \gamma)}{\int \int \int \int p(D \mid \beta_0, \beta_1, \sigma, \gamma) \ p(\beta_0, \beta_1, \sigma, \gamma) \ d\beta_0 \ d\beta_1 \ d\sigma \ d\gamma} \]

2.1 MCMC in JAGS

Read the data, create the data list.

myData = read.csv("HtWtData300.csv")
## =========== Column Filters
datatable(myData, 
          rownames = FALSE,
          filter = list(position = "top"))
plot(myData$height, myData$weight, ylab = "Weight (lbs)", xlab = "Height (inches)",
     main = "Scatter Plot between Height and Weight by Gender",
     pch = as.numeric(myData$male), col = as.factor(myData$male))
# Linear fit
abline(lm(myData$weight ~ myData$height), col = "orange", lwd = 3)

y <- myData$weight
x <- myData$height
# Ntotal = length(y)
dataList <- list(
    x = x ,
    y = y
)

# Specify the data in a list, for later shipment to JAGS:
# dataList = list(
#     y = y ,
#     x = x ,
#     Ntotal = Ntotal ,
#     meanY = mean(y) ,
#     sdY = sd(y)
# )
# dataList

Describe the model.
Based on the Normal distribution

modstring_norm = "
# Specify the Normal model for none-standardized data:
model {
    for (i in 1:Ntotal) {
        y[i] ~ dnorm(mu[i], prec)
        mu[i] = b[1] + b[2]*log_income[i] 
    }
    
    for (i in 1:2) {
        b[i] ~ dnorm(0.0, 1.0/1.0e6)
    }
    
    prec ~ dgamma(5/2.0, 5*10.0/2.0)
    sig2 = 1.0 / prec
    sig = sqrt(sig2)
} "

Based on the Student-t distribution

 modelString = "
# Standardize the data:
data {
    Ntotal <- length(y)
    xm <- mean(x)
    ym <- mean(y)
    xsd <- sd(x)
    ysd <- sd(y)
    for ( i in 1:length(y) ) {
      zx[i] <- (x[i] - xm) / xsd
      zy[i] <- (y[i] - ym) / ysd
    }
}
# Specify the model for standardized data:
model {
    for ( i in 1:Ntotal ) {
      zy[i] ~ dt( zbeta0 + zbeta1 * zx[i] , 1/zsigma^2 , nu )
    }
    # Priors vague on standardized scale:
    zbeta0 ~ dnorm(0, 1/(10)^2 )  
    zbeta1 ~ dnorm(0, 1/(10)^2 )
    zsigma ~ dunif(1.0E-3, 1.0E+3 )
    nu ~ dexp(1/30.0)
    # Transform to original scale:
    beta1 <- zbeta1 * ysd / xsd  
    beta0 <- zbeta0 * ysd  + ym - zbeta1 * xm * ysd / xsd 
    sigma <- zsigma * ysd
}
"
# Write out modelString to a text file
writeLines(modelString, con="TEMPmodel.txt")

Every arrow has a corresponding line in the description.
It is also possible putting description of data as

for (i in 1:Ntotal){
  zy[i] ~ dt( mu[i] , 1/zsigma^2 , nu )
  mu[i] <- zbeta0 + zbeta1 * zx[i]
}

That would allow recording mu[i] in MCMC.

Variable names starting with “z” mean that these variables are standardized (z-scores).

The intention of using z-scores in JAGS is to overcome a problem of correlation of the parameters (as the simulation the correlation between \(\beta_0\) and \(\beta_1\) in previous workshop).

Strong correlation creates thin and long shape on scatter-plot of the variables which makes Gibbs sampling very slow and inefficient.

This can be applied to STAN in all situation !!!

But remember to scale back to the original measures. HMC implemented in Stan does not have this problem.

parameters = c("beta0" ,  "beta1" ,  "sigma", 
              "zbeta0" , "zbeta1" , "zsigma", "nu")
adaptSteps = 500  # Number of steps to "tune" the samplers
burnInSteps = 1000
nChains = 4 
thinSteps = 1
numSavedSteps=20000
nIter = ceiling((numSavedSteps*thinSteps ) / nChains )
jagsModel = jags.model("TEMPmodel.txt", data=dataList ,
                      n.chains=nChains, n.adapt=adaptSteps)
## Compiling data graph
##    Resolving undeclared variables
##    Allocating nodes
##    Initializing
##    Reading data back into data table
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 300
##    Unobserved stochastic nodes: 4
##    Total graph size: 1495
## 
## Initializing model
update(jagsModel, n.iter=burnInSteps)
codaSamples = coda.samples(jagsModel, variable.names=parameters, 
                          n.iter=nIter, thin=thinSteps)

Explore the MCMC object.

library(HDInterval)
summary(codaSamples)
## 
## Iterations = 1501:6500
## Thinning interval = 1 
## Number of chains = 4 
## Sample size per chain = 5000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##             Mean       SD  Naive SE Time-series SE
## beta0  -140.0820 28.05969 0.1984119      0.2259339
## beta1     4.4632  0.41999 0.0029698      0.0032958
## nu        5.3706  1.61518 0.0114211      0.0227606
## sigma    23.9668  1.63862 0.0115868      0.0197453
## zbeta0   -0.0959  0.04682 0.0003311      0.0004033
## zbeta1    0.4907  0.04617 0.0003265      0.0003623
## zsigma    0.6834  0.04672 0.0003304      0.0005630
## 
## 2. Quantiles for each variable:
## 
##             2.5%       25%        50%       75%      97.5%
## beta0  -194.7328 -158.7522 -140.26981 -121.3214 -84.433158
## beta1     3.6320    4.1825    4.46516    4.7435   5.280274
## nu        3.1676    4.2537    5.06386    6.1421   9.332355
## sigma    20.8735   22.8469   23.93082   25.0605  27.265028
## zbeta0   -0.1876   -0.1274   -0.09644   -0.0649  -0.002535
## zbeta1    0.3993    0.4598    0.49090    0.5215   0.580513
## zsigma    0.5952    0.6514    0.68233    0.7145   0.777399
plot(codaSamples) # note: many graphs

autocorr.plot(codaSamples, ask=F)

effectiveSize(codaSamples)
##     beta0     beta1        nu     sigma    zbeta0    zbeta1    zsigma 
## 15443.787 16273.646  5161.462  6939.704 13536.225 16273.646  6939.704
#gelman.diag(codaSamples)
gelman.plot(codaSamples)   # lines may return error ==> Most likely reason: collinearity of parameters 

(HDIofChains<-lapply(codaSamples,function(z) cbind(Mu=hdi(codaSamples[[1]][,1]),Sd=hdi(codaSamples[[1]][,2]))))
## [[1]]
##             var1     var1
## lower -195.06250 3.651439
## upper  -86.61975 5.281509
## 
## [[2]]
##             var1     var1
## lower -195.06250 3.651439
## upper  -86.61975 5.281509
## 
## [[3]]
##             var1     var1
## lower -195.06250 3.651439
## upper  -86.61975 5.281509
## 
## [[4]]
##             var1     var1
## lower -195.06250 3.651439
## upper  -86.61975 5.281509

Look at strong correlation between beta0 and beta1 which slows Gibbs sampling down.

head(as.matrix(codaSamples[[1]]))
##          beta0    beta1       nu    sigma      zbeta0    zbeta1    zsigma
## [1,] -171.5638 4.914038 4.237061 22.29094 -0.14010801 0.5402488 0.6355744
## [2,] -149.7440 4.596812 3.409657 21.83242 -0.11849712 0.5053730 0.6225009
## [3,] -146.4599 4.525498 4.972364 22.20433 -0.15986180 0.4975328 0.6331048
## [4,] -175.3828 5.011857 6.615012 25.57258 -0.06381663 0.5510031 0.7291427
## [5,] -143.2086 4.510320 4.983787 21.87789 -0.09589158 0.4958641 0.6237973
## [6,] -141.2515 4.460113 4.590017 25.15170 -0.13513207 0.4903444 0.7171422
pairs(as.matrix(codaSamples[[1]])[,1:4])

2.2 MCMC in Stan

Describe the model in Stan.

In order to give a vague priors to slope and intercept consider the following arguments:

  1. The largest possible value of slope is

\[ \frac{\sigma_y}{\sigma_x} \]

when variables \(x\) and \(y\) are perfectly correlated.

Then standard deviation of the slope parameter \(\beta_1\) should be large enough to make the maximum value easily achievable.

  1. Size of intercept is defined by value of

\[ E[X] \frac{\sigma_y}{\sigma_x} \]

So, the prior should have enough width to include this value.

modelString = "
data {
    int<lower=1> Ntotal;
    real x[Ntotal];
    real y[Ntotal];
    real meanY;
    real sdY;
    real meanX;
    real sdX;
}
transformed data {
    real unifLo;
    real unifHi;
    real expLambda;
    real beta0sigma;
    real beta1sigma;
    unifLo = sdY/1000;
    unifHi = sdY*1000;
    expLambda = 1/30.0;
    beta1sigma = 10*fabs(sdY/sdX);
    beta0sigma = 10*(sdY^2+sdX^2)    / 10*fabs(meanX*sdY/sdX);
}
parameters {
    real beta0;
    real beta1;
    real<lower=0> nu; 
    real<lower=0> sigma; 
}
model {
    sigma ~ uniform(unifLo, unifHi); 
    nu ~ exponential(expLambda);
    beta0 ~ normal(0, beta0sigma);
    beta1 ~ normal(0, beta1sigma);
    for (i in 1:Ntotal) {
        y[i] ~ student_t(nu, beta0 + beta1 * x[i], sigma);
    }
}
"
stanDsoRobustReg <- stan_model(model_code=modelString) 
d <- read.csv("HtWtData300.csv")
dat<-list(Ntotal=length(d$weight),y=d$weight,meanY=mean(d$weight),
          sdY=sd(d$weight),x=d$height,meanX=mean(d$height),sdX=sd(d$height))
# 17_1: robust model to predict metric variable using metric predictor

# using Height-Weight data from Kruschke
# fit model
fitSimRegStan <- sampling(stanDsoRobustReg, 
             data=dat, 
             pars=c('beta0', 'beta1', 'nu', 'sigma'),
             iter=5000, chains = 4, cores = 4
)

Save the fitted object.

saveRDS(fitSimRegStan,file="fitSimRegStan.Rdata")
#fitSimRegStan <- readRDS("fitSimRegStan.Rdata")
print(fitSimRegStan)
## Inference for Stan model: 99bd7b261ff9240bf0c6d7b1b21831d6.
## 4 chains, each with iter=5000; warmup=2500; thin=1; 
## post-warmup draws per chain=2500, total post-warmup draws=10000.
## 
##           mean se_mean    sd     2.5%      25%      50%      75%    97.5% n_eff
## beta0  -139.46    0.47 27.76  -193.12  -158.29  -139.84  -120.86   -84.01  3424
## beta1     4.45    0.01  0.42     3.62     4.17     4.46     4.73     5.25  3444
## nu        5.38    0.03  1.67     3.16     4.24     5.05     6.10     9.51  3768
## sigma    23.95    0.03  1.63    20.86    22.82    23.91    25.04    27.21  3817
## lp__  -1264.99    0.03  1.44 -1268.61 -1265.70 -1264.67 -1263.93 -1263.18  3151
##       Rhat
## beta0    1
## beta1    1
## nu       1
## sigma    1
## lp__     1
## 
## Samples were drawn using NUTS(diag_e) at Wed Mar 24 23:11:06 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
plot(fitSimRegStan)
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)

rstan::traceplot(fitSimRegStan, ncol=1, inc_warmup=F)

pairs(fitSimRegStan, pars=c('nu','beta0','beta1','sigma'))

stan_scat(fitSimRegStan, c('beta0','beta1'))

stan_scat(fitSimRegStan, c('beta1','sigma'))

stan_scat(fitSimRegStan, c('beta0','sigma'))

stan_scat(fitSimRegStan, c('nu','sigma'))

stan_dens(fitSimRegStan)

stan_ac(fitSimRegStan, separate_chains = T)

stan_diag(fitSimRegStan,information = "sample",chain=0)
## Warning: Removed 2 rows containing missing values (geom_bar).

stan_diag(fitSimRegStan,information = "stepsize",chain = 0)

stan_diag(fitSimRegStan,information = "treedepth",chain = 0)

stan_diag(fitSimRegStan,information = "divergence",chain = 0)

Work with shinystan object.

launch_shinystan(fitSimRegStan)
## 
## Launching ShinyStan interface... for large models this  may take some time.
## 
## Listening on http://127.0.0.1:3894

3 Using fitted regression model for prediction

Recall that the data in this example contains predictor height and output weight for a group of people from HtWtData300.csv (data above).

Plot all heights observed in the sample and check the summary of the variable.

plot(1:length(dat$x),dat$x)

summary(dat$x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   54.60   64.00   66.20   66.39   69.20   76.00

Can we predict weight of a person who is 50 or 80 inches tall?

To do this we can go through all pairs of simulated parameters (\(\beta_0\), \(\beta_1\)) and use them to simulate \(y(50)\) and \(y(80)\).
This gives distribution of predicted values.

summary(fitSimRegStan)
## $summary
##               mean     se_mean         sd         2.5%          25%
## beta0  -139.457716 0.474473592 27.7647845  -193.116667  -158.286756
## beta1     4.453972 0.007078577  0.4153911     3.624519     4.173722
## nu        5.376267 0.027127112  1.6652200     3.156314     4.244959
## sigma    23.947109 0.026455221  1.6344348    20.863321    22.817115
## lp__  -1264.989875 0.025691887  1.4422166 -1268.608443 -1265.699675
##                50%          75%        97.5%    n_eff     Rhat
## beta0  -139.844025  -120.857894   -84.010420 3424.243 1.000925
## beta1     4.457688     4.734091     5.252953 3443.678 1.000856
## nu        5.047634     6.095892     9.512076 3768.219 1.000766
## sigma    23.913962    25.040586    27.212601 3816.914 1.002366
## lp__  -1264.665599 -1263.931458 -1263.175565 3151.149 0.999746
## 
## $c_summary
## , , chains = chain:1
## 
##          stats
## parameter         mean         sd         2.5%          25%          50%
##     beta0  -139.555597 27.4597006  -192.105111  -157.412180  -140.500612
##     beta1     4.455016  0.4111828     3.635894     4.183359     4.465408
##     nu        5.348082  1.5971432     3.218502     4.245957     5.016586
##     sigma    23.975451  1.6225573    21.023659    22.797586    23.973171
##     lp__  -1264.975332  1.3863499 -1268.395668 -1265.683961 -1264.696240
##          stats
## parameter          75%        97.5%
##     beta0  -121.592411   -84.278087
##     beta1     4.724725     5.244700
##     nu        6.074922     9.284072
##     sigma    25.072882    27.267627
##     lp__  -1263.948771 -1263.188271
## 
## , , chains = chain:2
## 
##          stats
## parameter         mean         sd         2.5%          25%          50%
##     beta0  -137.947426 28.2430234  -193.077997  -157.144437  -138.247678
##     beta1     4.432032  0.4223973     3.595172     4.141317     4.430180
##     nu        5.474135  1.6888089     3.180908     4.313370     5.167747
##     sigma    24.082420  1.6304548    20.919179    23.016345    24.088228
##     lp__  -1264.997107  1.4349376 -1268.432280 -1265.703064 -1264.676008
##          stats
## parameter          75%        97.5%
##     beta0  -118.686252   -81.290100
##     beta1     4.719801     5.254270
##     nu        6.223180     9.496769
##     sigma    25.173914    27.206210
##     lp__  -1263.940853 -1263.208054
## 
## , , chains = chain:3
## 
##          stats
## parameter         mean         sd         2.5%          25%          50%
##     beta0  -141.128865 27.0739862  -193.567806  -159.928138  -141.184528
##     beta1     4.478245  0.4055515     3.693086     4.198271     4.477226
##     nu        5.287256  1.6590084     3.088694     4.172013     4.970867
##     sigma    23.785158  1.6439336    20.582666    22.707214    23.716970
##     lp__  -1265.001908  1.4557212 -1268.750852 -1265.710011 -1264.661592
##          stats
## parameter          75%        97.5%
##     beta0  -122.469971   -88.959180
##     beta1     4.754764     5.260620
##     nu        5.990676     9.458371
##     sigma    24.838777    27.132066
##     lp__  -1263.917762 -1263.156629
## 
## , , chains = chain:4
## 
##          stats
## parameter         mean         sd         2.5%          25%          50%
##     beta0  -139.198975 28.1886842  -193.066614  -158.635320  -139.280035
##     beta1     4.450594  0.4211414     3.612505     4.167589     4.450718
##     nu        5.395597  1.7091611     3.175277     4.234994     5.048846
##     sigma    23.945407  1.6278086    20.993270    22.825139    23.877884
##     lp__  -1264.985151  1.4905988 -1268.896910 -1265.699569 -1264.617903
##          stats
## parameter          75%        97.5%
##     beta0  -120.442022   -82.393654
##     beta1     4.736709     5.249889
##     nu        6.082440     9.747841
##     sigma    25.032312    27.232596
##     lp__  -1263.896917 -1263.168898
regParam<-cbind(Beta0=rstan::extract(fitSimRegStan,pars="beta0")$'beta0',
                Beta1=rstan::extract(fitSimRegStan,pars="beta1")$'beta1')
head(regParam)
##          Beta0    Beta1
## [1,] -116.0424 4.086300
## [2,] -161.1032 4.779901
## [3,] -158.2478 4.723801
## [4,] -165.5372 4.869262
## [5,] -134.6192 4.379242
## [6,] -153.0722 4.636497
predX50<-apply(regParam,1,function(z) z%*%c(1,50))
predX80<-apply(regParam,1,function(z) z%*%c(1,80))

Plot both distributions, look at their summaries and HDIs.

suppressWarnings(library(HDInterval))
den<-density(predX50)
plot(density(predX80),xlim=c(60,240))
lines(den$x,den$y)

summary(cbind(predX50,predX80))
##     predX50          predX80     
##  Min.   : 60.45   Min.   :194.9  
##  1st Qu.: 78.41   1st Qu.:212.9  
##  Median : 83.16   Median :216.9  
##  Mean   : 83.24   Mean   :216.9  
##  3rd Qu.: 88.06   3rd Qu.:220.7  
##  Max.   :111.60   Max.   :236.1
rbind(predX50=hdi(predX50),predX80=hdi(predX80))
##             lower    upper
## predX50  69.20181  97.1803
## predX80 205.39034 227.8176

4 Hierarchical regression on individuals within groups

4.1 Simulated data from the book

4.1.1 Data

4.1.2 Model description

4.1.3 First run: default parameters

4.1.4 Trying to improve convergence by altering step size

4.1.5 Gaussian model

4.2 Example from Linear and Nonlinear Models

References

Kruschke, John K. 2015. Doing Bayesian Data Analysis : A Tutorial with r, JAGS, and Stan. Book. 2E [edition]. Amsterdam: Academic Press is an imprint of Elsevier.