In this example, the weight of a person (in lbs) is predicted from height (in inches).
We calculate the HDI using the utilities file from Kruschke (2015).
library(rjags)
library(rstan)
library(shinystan)
#options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
#Sys.setenv(LOCAL_CPPFLAGS = '-march=native') # do not run this line; it applies to an older version of rstan
library("DT") #datatable
The simple linear regression model is
\[ y = \beta_0 + \beta_1 x + \epsilon \]
With the Bayesian approach, the distribution of \(\epsilon\) does not have to be Gaussian (normal); we are going to use a robust assumption based on the Student-t distribution.
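To see what the robust assumption buys us, compare the tail probabilities of the Student-t and normal distributions; the heavier tails let the model tolerate outliers (the choice of \(\nu = 4\) here is just for illustration):
# Probability of falling more than 3 scale units below the center
pnorm(-3)      # ~0.0013 under the normal
pt(-3, df = 4) # ~0.0200 under a Student-t with nu = 4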
Parameterization of the model for MCMC (adapted from Kruschke 2015).
Bayes' theorem for this model looks like this:
\[ p(\beta_0, \beta_1, \sigma, \nu \mid D) = \frac{p(D \mid \beta_0, \beta_1, \sigma, \nu) \ p(\beta_0, \beta_1, \sigma, \nu)}{\int \int \int \int p(D \mid \beta_0, \beta_1, \sigma, \nu) \ p(\beta_0, \beta_1, \sigma, \nu) \ d\beta_0 \ d\beta_1 \ d\sigma \ d\nu} \]
where \(\nu\) is the degrees-of-freedom (normality) parameter of the Student-t distribution.
Read the data, create the data list.
myData = read.csv("HtWtData300.csv")
## =========== Column Filters
datatable(myData,
rownames = FALSE,
filter = list(position = "top"))
plot(myData$height, myData$weight, ylab = "Weight (lbs)", xlab = "Height (inches)",
main = "Scatter Plot between Height and Weight by Gender",
pch = as.numeric(myData$male), col = as.factor(myData$male))
# Linear fit
abline(lm(myData$weight ~ myData$height), col = "orange", lwd = 3)
y <- myData$weight
x <- myData$height
# Ntotal = length(y)
dataList <- list(
x = x ,
y = y
)
# Specify the data in a list, for later shipment to JAGS:
# dataList = list(
# y = y ,
# x = x ,
# Ntotal = Ntotal ,
# meanY = mean(y) ,
# sdY = sd(y)
# )
# dataList
Describe the model.
Based on the Normal distribution
modstring_norm = "
# Specify the Normal model for non-standardized data:
model {
for (i in 1:Ntotal) {
y[i] ~ dnorm(mu[i], prec)
mu[i] = b[1] + b[2]*x[i]
}
for (i in 1:2) {
b[i] ~ dnorm(0.0, 1.0/1.0e6)
}
prec ~ dgamma(5/2.0, 5*10.0/2.0)
sig2 = 1.0 / prec
sig = sqrt(sig2)
} "
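The normal model is shown for comparison and is not fitted in this workshop. A minimal sketch of how it could be run (note that, unlike the Student-t model below, it has no data block, so Ntotal must be supplied in the data list):
# Hypothetical fit of the normal model (not executed here)
dataListNorm <- list(x = x, y = y, Ntotal = length(y))
jagsModelNorm <- jags.model(textConnection(modstring_norm),
                            data = dataListNorm, n.chains = 4)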
Based on the Student-t distribution
modelString = "
# Standardize the data:
data {
Ntotal <- length(y)
xm <- mean(x)
ym <- mean(y)
xsd <- sd(x)
ysd <- sd(y)
for ( i in 1:length(y) ) {
zx[i] <- (x[i] - xm) / xsd
zy[i] <- (y[i] - ym) / ysd
}
}
# Specify the model for standardized data:
model {
for ( i in 1:Ntotal ) {
zy[i] ~ dt( zbeta0 + zbeta1 * zx[i] , 1/zsigma^2 , nu )
}
# Priors vague on standardized scale:
zbeta0 ~ dnorm(0, 1/(10)^2 )
zbeta1 ~ dnorm(0, 1/(10)^2 )
zsigma ~ dunif(1.0E-3, 1.0E+3 )
nu ~ dexp(1/30.0)
# Transform to original scale:
beta1 <- zbeta1 * ysd / xsd
beta0 <- zbeta0 * ysd + ym - zbeta1 * xm * ysd / xsd
sigma <- zsigma * ysd
}
"
# Write out modelString to a text file
writeLines(modelString, con="TEMPmodel.txt")
Every arrow in the model diagram has a corresponding line in the model description.
It is also possible to describe the data as
for (i in 1:Ntotal){
zy[i] ~ dt( mu[i] , 1/zsigma^2 , nu )
mu[i] <- zbeta0 + zbeta1 * zx[i]
}
That would allow recording mu[i] in the MCMC output, as sketched below.
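With that version of the model, mu can be monitored like any other variable; a sketch, using the jagsModel and parameters objects defined below (this works only if TEMPmodel.txt actually defines the mu[i] node, and it adds one output column per observation):
# Hypothetical: monitor mu[i] as well (assumes the rewritten model)
codaWithMu <- coda.samples(jagsModel, variable.names = c(parameters, "mu"),
                           n.iter = nIter, thin = thinSteps)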
Variable names starting with “z” mean that these variables are standardized (z-scores).
The intention of using z-scores in JAGS is to overcome the problem of strong correlation between the parameters (recall the correlation between \(\beta_0\) and \(\beta_1\) simulated in the previous workshop).
Strong correlation creates a thin, elongated shape on the scatter plot of the parameters, which makes Gibbs sampling very slow and inefficient.
Standardization can be applied in Stan as well, in all situations, but remember to transform the parameters back to the original scale.
Note, however, that the HMC sampler implemented in Stan does not suffer from this problem.
parameters = c("beta0" , "beta1" , "sigma",
"zbeta0" , "zbeta1" , "zsigma", "nu")
adaptSteps = 500 # Number of steps to "tune" the samplers
burnInSteps = 1000
nChains = 4
thinSteps = 1
numSavedSteps=20000
nIter = ceiling((numSavedSteps*thinSteps ) / nChains )
jagsModel = jags.model("TEMPmodel.txt", data=dataList ,
n.chains=nChains, n.adapt=adaptSteps)
## Compiling data graph
## Resolving undeclared variables
## Allocating nodes
## Initializing
## Reading data back into data table
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 300
## Unobserved stochastic nodes: 4
## Total graph size: 1495
##
## Initializing model
update(jagsModel, n.iter=burnInSteps)
codaSamples = coda.samples(jagsModel, variable.names=parameters,
n.iter=nIter, thin=thinSteps)
Explore the MCMC object.
library(HDInterval)
summary(codaSamples)
##
## Iterations = 1501:6500
## Thinning interval = 1
## Number of chains = 4
## Sample size per chain = 5000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## beta0 -140.0820 28.05969 0.1984119 0.2259339
## beta1 4.4632 0.41999 0.0029698 0.0032958
## nu 5.3706 1.61518 0.0114211 0.0227606
## sigma 23.9668 1.63862 0.0115868 0.0197453
## zbeta0 -0.0959 0.04682 0.0003311 0.0004033
## zbeta1 0.4907 0.04617 0.0003265 0.0003623
## zsigma 0.6834 0.04672 0.0003304 0.0005630
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## beta0 -194.7328 -158.7522 -140.26981 -121.3214 -84.433158
## beta1 3.6320 4.1825 4.46516 4.7435 5.280274
## nu 3.1676 4.2537 5.06386 6.1421 9.332355
## sigma 20.8735 22.8469 23.93082 25.0605 27.265028
## zbeta0 -0.1876 -0.1274 -0.09644 -0.0649 -0.002535
## zbeta1 0.3993 0.4598 0.49090 0.5215 0.580513
## zsigma 0.5952 0.6514 0.68233 0.7145 0.777399
plot(codaSamples) # note: many graphs
autocorr.plot(codaSamples, ask=F)
effectiveSize(codaSamples)
## beta0 beta1 nu sigma zbeta0 zbeta1 zsigma
## 15443.787 16273.646 5161.462 6939.704 13536.225 16273.646 6939.704
#gelman.diag(codaSamples)
gelman.plot(codaSamples) # may return an error for some parameters; the most likely reason is collinearity of the parameters
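If gelman.diag() fails on the multivariate statistic for the same reason, the univariate shrink factors can still be obtained (a workaround using an argument of coda::gelman.diag):
gelman.diag(codaSamples, multivariate = FALSE) # univariate Rhat only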
(HDIofChains <- lapply(codaSamples, function(z) cbind(Beta0 = hdi(z[, 1]), Beta1 = hdi(z[, 2]))))
## [[1]]
## var1 var1
## lower -195.06250 3.651439
## upper -86.61975 5.281509
##
## [[2]]
## var1 var1
## lower -195.06250 3.651439
## upper -86.61975 5.281509
##
## [[3]]
## var1 var1
## lower -195.06250 3.651439
## upper -86.61975 5.281509
##
## [[4]]
## var1 var1
## lower -195.06250 3.651439
## upper -86.61975 5.281509
Note the strong correlation between beta0 and beta1, which slows Gibbs sampling down.
head(as.matrix(codaSamples[[1]]))
## beta0 beta1 nu sigma zbeta0 zbeta1 zsigma
## [1,] -171.5638 4.914038 4.237061 22.29094 -0.14010801 0.5402488 0.6355744
## [2,] -149.7440 4.596812 3.409657 21.83242 -0.11849712 0.5053730 0.6225009
## [3,] -146.4599 4.525498 4.972364 22.20433 -0.15986180 0.4975328 0.6331048
## [4,] -175.3828 5.011857 6.615012 25.57258 -0.06381663 0.5510031 0.7291427
## [5,] -143.2086 4.510320 4.983787 21.87789 -0.09589158 0.4958641 0.6237973
## [6,] -141.2515 4.460113 4.590017 25.15170 -0.13513207 0.4903444 0.7171422
pairs(as.matrix(codaSamples[[1]])[,1:4])
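The effect of standardization can be quantified directly: on the original scale beta0 and beta1 are almost perfectly (negatively) correlated, while the standardized parameters are not (a quick check on the first chain):
chain1 <- as.matrix(codaSamples[[1]])
cor(chain1[, "beta0"], chain1[, "beta1"])   # close to -1 on the original scale
cor(chain1[, "zbeta0"], chain1[, "zbeta1"]) # much closer to 0 for the standardized parameters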
Describe the model in Stan.
In order to give vague priors to the slope and intercept, consider the following argument. The largest magnitude the slope can reach is
\[ \frac{\sigma_y}{\sigma_x}, \]
attained when the variables \(x\) and \(y\) are perfectly correlated.
The standard deviation of the prior for the slope parameter \(\beta_1\) should therefore be large enough to make this maximum value easily achievable.
Similarly, at the maximum slope the intercept is of the order of
\[ E[X] \frac{\sigma_y}{\sigma_x}, \]
so the prior on \(\beta_0\) should have enough width to include this value.
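For the height-weight data these prior scales can be computed directly from the sample moments, matching the transformed data block of the Stan model below (a quick check):
sdY <- sd(myData$weight); sdX <- sd(myData$height); meanX <- mean(myData$height)
10 * sdY / sdX              # prior sd for the slope beta1
10 * abs(meanX) * sdY / sdX # prior sd for the intercept beta0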
modelString = "
data {
int<lower=1> Ntotal;
real x[Ntotal];
real y[Ntotal];
real meanY;
real sdY;
real meanX;
real sdX;
}
transformed data {
real unifLo;
real unifHi;
real expLambda;
real beta0sigma;
real beta1sigma;
unifLo = sdY/1000;
unifHi = sdY*1000;
expLambda = 1/30.0;
beta1sigma = 10*fabs(sdY/sdX);
beta0sigma = 10*fabs(meanX*sdY/sdX);
}
parameters {
real beta0;
real beta1;
real<lower=0> nu;
real<lower=0> sigma;
}
model {
sigma ~ uniform(unifLo, unifHi);
nu ~ exponential(expLambda);
beta0 ~ normal(0, beta0sigma);
beta1 ~ normal(0, beta1sigma);
for (i in 1:Ntotal) {
y[i] ~ student_t(nu, beta0 + beta1 * x[i], sigma);
}
}
"
stanDsoRobustReg <- stan_model(model_code=modelString)
d <- read.csv("HtWtData300.csv")
dat<-list(Ntotal=length(d$weight),y=d$weight,meanY=mean(d$weight),
sdY=sd(d$weight),x=d$height,meanX=mean(d$height),sdX=sd(d$height))
# 17_1: robust model to predict metric variable using metric predictor
# using Height-Weight data from Kruschke
# fit model
fitSimRegStan <- sampling(stanDsoRobustReg,
data=dat,
pars=c('beta0', 'beta1', 'nu', 'sigma'),
iter=5000, chains = 4, cores = 4
)
Save the fitted object.
saveRDS(fitSimRegStan,file="fitSimRegStan.Rdata")
#fitSimRegStan <- readRDS("fitSimRegStan.Rdata")
print(fitSimRegStan)
## Inference for Stan model: 99bd7b261ff9240bf0c6d7b1b21831d6.
## 4 chains, each with iter=5000; warmup=2500; thin=1;
## post-warmup draws per chain=2500, total post-warmup draws=10000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
## beta0 -139.46 0.47 27.76 -193.12 -158.29 -139.84 -120.86 -84.01 3424
## beta1 4.45 0.01 0.42 3.62 4.17 4.46 4.73 5.25 3444
## nu 5.38 0.03 1.67 3.16 4.24 5.05 6.10 9.51 3768
## sigma 23.95 0.03 1.63 20.86 22.82 23.91 25.04 27.21 3817
## lp__ -1264.99 0.03 1.44 -1268.61 -1265.70 -1264.67 -1263.93 -1263.18 3151
## Rhat
## beta0 1
## beta1 1
## nu 1
## sigma 1
## lp__ 1
##
## Samples were drawn using NUTS(diag_e) at Wed Mar 24 23:11:06 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
plot(fitSimRegStan)
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
rstan::traceplot(fitSimRegStan, ncol=1, inc_warmup=F)
pairs(fitSimRegStan, pars=c('nu','beta0','beta1','sigma'))
stan_scat(fitSimRegStan, c('beta0','beta1'))
stan_scat(fitSimRegStan, c('beta1','sigma'))
stan_scat(fitSimRegStan, c('beta0','sigma'))
stan_scat(fitSimRegStan, c('nu','sigma'))
stan_dens(fitSimRegStan)
stan_ac(fitSimRegStan, separate_chains = T)
stan_diag(fitSimRegStan,information = "sample",chain=0)
## Warning: Removed 2 rows containing missing values (geom_bar).
stan_diag(fitSimRegStan,information = "stepsize",chain = 0)
stan_diag(fitSimRegStan,information = "treedepth",chain = 0)
stan_diag(fitSimRegStan,information = "divergence",chain = 0)
Work with the shinystan object.
launch_shinystan(fitSimRegStan)
##
## Launching ShinyStan interface... for large models this may take some time.
##
## Listening on http://127.0.0.1:3894
Recall that the data in this example contain the predictor height and the response weight for a group of people from HtWtData300.csv (shown above).
Plot all heights observed in the sample and check the summary of the variable.
plot(1:length(dat$x),dat$x)
summary(dat$x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 54.60 64.00 66.20 66.39 69.20 76.00
Can we predict the weight of a person who is 50 or 80 inches tall? Note that both values lie outside the observed range of heights (54.6 to 76 inches), so this is an extrapolation.
To do this we can go through all pairs of simulated parameters (\(\beta_0\), \(\beta_1\)) and use them to compute \(y(50)\) and \(y(80)\).
This gives the distribution of predicted values.
summary(fitSimRegStan)
## $summary
## mean se_mean sd 2.5% 25%
## beta0 -139.457716 0.474473592 27.7647845 -193.116667 -158.286756
## beta1 4.453972 0.007078577 0.4153911 3.624519 4.173722
## nu 5.376267 0.027127112 1.6652200 3.156314 4.244959
## sigma 23.947109 0.026455221 1.6344348 20.863321 22.817115
## lp__ -1264.989875 0.025691887 1.4422166 -1268.608443 -1265.699675
## 50% 75% 97.5% n_eff Rhat
## beta0 -139.844025 -120.857894 -84.010420 3424.243 1.000925
## beta1 4.457688 4.734091 5.252953 3443.678 1.000856
## nu 5.047634 6.095892 9.512076 3768.219 1.000766
## sigma 23.913962 25.040586 27.212601 3816.914 1.002366
## lp__ -1264.665599 -1263.931458 -1263.175565 3151.149 0.999746
##
## $c_summary
## , , chains = chain:1
##
## stats
## parameter mean sd 2.5% 25% 50%
## beta0 -139.555597 27.4597006 -192.105111 -157.412180 -140.500612
## beta1 4.455016 0.4111828 3.635894 4.183359 4.465408
## nu 5.348082 1.5971432 3.218502 4.245957 5.016586
## sigma 23.975451 1.6225573 21.023659 22.797586 23.973171
## lp__ -1264.975332 1.3863499 -1268.395668 -1265.683961 -1264.696240
## stats
## parameter 75% 97.5%
## beta0 -121.592411 -84.278087
## beta1 4.724725 5.244700
## nu 6.074922 9.284072
## sigma 25.072882 27.267627
## lp__ -1263.948771 -1263.188271
##
## , , chains = chain:2
##
## stats
## parameter mean sd 2.5% 25% 50%
## beta0 -137.947426 28.2430234 -193.077997 -157.144437 -138.247678
## beta1 4.432032 0.4223973 3.595172 4.141317 4.430180
## nu 5.474135 1.6888089 3.180908 4.313370 5.167747
## sigma 24.082420 1.6304548 20.919179 23.016345 24.088228
## lp__ -1264.997107 1.4349376 -1268.432280 -1265.703064 -1264.676008
## stats
## parameter 75% 97.5%
## beta0 -118.686252 -81.290100
## beta1 4.719801 5.254270
## nu 6.223180 9.496769
## sigma 25.173914 27.206210
## lp__ -1263.940853 -1263.208054
##
## , , chains = chain:3
##
## stats
## parameter mean sd 2.5% 25% 50%
## beta0 -141.128865 27.0739862 -193.567806 -159.928138 -141.184528
## beta1 4.478245 0.4055515 3.693086 4.198271 4.477226
## nu 5.287256 1.6590084 3.088694 4.172013 4.970867
## sigma 23.785158 1.6439336 20.582666 22.707214 23.716970
## lp__ -1265.001908 1.4557212 -1268.750852 -1265.710011 -1264.661592
## stats
## parameter 75% 97.5%
## beta0 -122.469971 -88.959180
## beta1 4.754764 5.260620
## nu 5.990676 9.458371
## sigma 24.838777 27.132066
## lp__ -1263.917762 -1263.156629
##
## , , chains = chain:4
##
## stats
## parameter mean sd 2.5% 25% 50%
## beta0 -139.198975 28.1886842 -193.066614 -158.635320 -139.280035
## beta1 4.450594 0.4211414 3.612505 4.167589 4.450718
## nu 5.395597 1.7091611 3.175277 4.234994 5.048846
## sigma 23.945407 1.6278086 20.993270 22.825139 23.877884
## lp__ -1264.985151 1.4905988 -1268.896910 -1265.699569 -1264.617903
## stats
## parameter 75% 97.5%
## beta0 -120.442022 -82.393654
## beta1 4.736709 5.249889
## nu 6.082440 9.747841
## sigma 25.032312 27.232596
## lp__ -1263.896917 -1263.168898
regParam<-cbind(Beta0=rstan::extract(fitSimRegStan,pars="beta0")$'beta0',
Beta1=rstan::extract(fitSimRegStan,pars="beta1")$'beta1')
head(regParam)
## Beta0 Beta1
## [1,] -116.0424 4.086300
## [2,] -161.1032 4.779901
## [3,] -158.2478 4.723801
## [4,] -165.5372 4.869262
## [5,] -134.6192 4.379242
## [6,] -153.0722 4.636497
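The same draws can be extracted in a single call with the as.matrix method for stanfit objects (an equivalent alternative; the set of draws is identical, though their ordering may differ from rstan::extract):
# one call instead of two rstan::extract() calls
regParamAlt <- as.matrix(fitSimRegStan, pars = c("beta0", "beta1"))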
predX50<-apply(regParam,1,function(z) z%*%c(1,50))
predX80<-apply(regParam,1,function(z) z%*%c(1,80))
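Note that predX50 and predX80 describe the posterior uncertainty of the regression line at x = 50 and x = 80, i.e. of the mean weight, not of an individual's weight. To simulate a new individual observation one would also add the Student-t noise; a sketch reusing the fitted draws (ppX50 is a hypothetical name):
sigmaDraws <- rstan::extract(fitSimRegStan, pars = "sigma")$sigma
nuDraws    <- rstan::extract(fitSimRegStan, pars = "nu")$nu
# posterior predictive draws at x = 50: line value plus scaled t noise
ppX50 <- predX50 + sigmaDraws * rt(length(predX50), df = nuDraws)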
Plot the distributions of predX50 and predX80, and look at their summaries and HDIs.
suppressWarnings(library(HDInterval))
den<-density(predX50)
plot(density(predX80),xlim=c(60,240))
lines(den$x,den$y)
summary(cbind(predX50,predX80))
## predX50 predX80
## Min. : 60.45 Min. :194.9
## 1st Qu.: 78.41 1st Qu.:212.9
## Median : 83.16 Median :216.9
## Mean : 83.24 Mean :216.9
## 3rd Qu.: 88.06 3rd Qu.:220.7
## Max. :111.60 Max. :236.1
rbind(predX50=hdi(predX50),predX80=hdi(predX80))
## lower upper
## predX50 69.20181 97.1803
## predX80 205.39034 227.8176
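As a final visual check, we can overlay a sample of posterior regression lines on the original scatter plot (an illustration reusing the regParam draws; 50 lines is an arbitrary choice):
plot(myData$height, myData$weight, xlab = "Height (inches)", ylab = "Weight (lbs)")
idx <- sample(nrow(regParam), 50) # 50 random posterior draws
for (i in idx) abline(a = regParam[i, 1], b = regParam[i, 2], col = rgb(1, 0.5, 0, 0.2))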