# setwd("U:/PRJ/Bayesian")
diab = read.csv("/Users/nirajanbudhathoki/OneDrive - Central Michigan University/diabetes.csv")
head(diab)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
# A value of 0 in the dataset denotes a missing value. Impute zeros with the
# median for all predictors except "Pregnancies" (where 0 is a legitimate value).
myfun = function(x){
  ifelse(x == 0, median(x), x)  # note: median(x) is computed over all values, zeros included
}

imp.median <- data.frame(
  sapply(diab[,-c(1,9)],myfun))
finaldata = cbind(diab[,c(9,1)],imp.median)
head(finaldata)
##   Outcome Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1       1           6     148            72            35    30.5 33.6
## 2       0           1      85            66            29    30.5 26.6
## 3       1           8     183            64            23    30.5 23.3
## 4       0           1      89            66            23    94.0 28.1
## 5       1           0     137            40            35   168.0 43.1
## 6       0           5     116            74            23    30.5 25.6
##   DiabetesPedigreeFunction Age
## 1                    0.627  50
## 2                    0.351  31
## 3                    0.672  32
## 4                    0.167  21
## 5                    2.288  33
## 6                    0.201  30
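
One caveat on the imputation above: median(x) is computed over all values, including the zeros being replaced, which pulls the imputed value down (Insulin, for instance, is imputed with 30.5). A minimal sketch of an alternative that excludes the zeros (myfun2 is our name, not part of the original analysis):

# Illustrative alternative: impute with the median of the non-zero values only.
myfun2 = function(x){
  ifelse(x == 0, median(x[x != 0]), x)
}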

Next, we split the data into training and testing sets. Models will be built on the training data and their performance evaluated on the testing data.

set.seed(111)
train = sample(1:nrow(finaldata), nrow(finaldata)*0.8)  # put 80% of the rows in the training set
finaldata.train = finaldata[train,]
finaldata.test = finaldata[-train,]

Let’s see how correlated the different predictors are. High correlation between predictors is problematic if our goal is inference; if the goal is prediction, it is much less of a concern.

library(corrplot)
Cor = cor(finaldata.train[,-1])
corrplot(Cor, type="upper", method="number", tl.pos="d")

Pregnancies and Age show a moderate positive correlation, as do SkinThickness and BMI.

The predictors are measured in different units, so we scale them to make the posterior results comparable later on. The Cauchy prior on the regression coefficients requires the predictors to be scaled to mean 0 and standard deviation 0.5. For consistency, we will use the same scaling with the default normal prior as well.

X1 = scale(finaldata.train[,-1]) * 0.5  # center to mean 0, rescale to standard deviation 0.5
colMeans(X1)
##              Pregnancies                  Glucose            BloodPressure 
##            -2.481728e-17            -8.759788e-17             1.175318e-16 
##            SkinThickness                  Insulin                      BMI 
##             2.853253e-17            -1.558143e-17            -4.838579e-17 
## DiabetesPedigreeFunction                      Age 
##            -5.214765e-18             3.839841e-17
apply(X1,2,sd)
##              Pregnancies                  Glucose            BloodPressure 
##                      0.5                      0.5                      0.5 
##            SkinThickness                  Insulin                      BMI 
##                      0.5                      0.5                      0.5 
## DiabetesPedigreeFunction                      Age 
##                      0.5                      0.5

The models are developed using JAGS, which stands for Just Another Gibbs Sampler. JAGS’s functionality can be accessed from R through the “rjags” package. The first step is to specify the model. As discussed previously, the first prior on the intercept and the other regression coefficients will be a normal prior with mean 0 and variance 100. These values make the prior non-informative, so the inferences are essentially data-driven. JAGS requires the model to be written as a string.
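
For reference, the model for observation \(i\) is

\[
y_i \sim \text{Bernoulli}(p_i), \qquad \log\left(\frac{p_i}{1 - p_i}\right) = \beta_0 + \sum_{j=1}^{8} \beta_j x_{ij},
\]

or equivalently, solving for \(p_i\),

\[
p_i = \frac{1}{1 + \exp\left(-\beta_0 - \sum_{j=1}^{8} \beta_j x_{ij}\right)}.
\]

The second form is the one we will plug the posterior means into for prediction later.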

library(rjags)

mod1_string = " model {
    for (i in 1:length(y)) {
        y[i] ~ dbern(p[i])         # Likelihood portion
        logit(p[i]) = int + b[1]*Pregnancies[i] + b[2]*Glucose[i] + b[3]*BloodPressure[i] + b[4]*SkinThickness[i] + b[5]*Insulin[i] + b[6]*BMI[i] + b[7]*DiabetesPedigreeFunction[i] + b[8]*Age[i]
    }
    int ~ dnorm(0.0, 1.0/100.0)    # Normal prior with mean 0 and variance 100 (equivalently, precision of 1/100).
    for (j in 1:8) {
        b[j] ~ dnorm(0.0, 1.0/100.0)
    }
} "

The second step is to set up the model and pass the data as a named list.

set.seed(33)

data_jags = list(y=finaldata.train$Outcome, Pregnancies=X1[,"Pregnancies"], Glucose=X1[,"Glucose"], BloodPressure=X1[,"BloodPressure"], SkinThickness=X1[,"SkinThickness"], Insulin=X1[,"Insulin"], BMI=X1[,"BMI"], DiabetesPedigreeFunction=X1[,"DiabetesPedigreeFunction"], Age=X1[,"Age"])

# Parameters to monitor: the intercept and the coefficients.
params = c("int", "b")

# Specify the model itself.
mod1 = jags.model(textConnection(mod1_string), data=data_jags, n.chains=3) # Run three different chains with different starting values
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 614
##    Unobserved stochastic nodes: 9
##    Total graph size: 7893
## 
## Initializing model
# Burn-in period of 1000 iterations: samples from these iterations are not kept.
update(mod1, 1e3)

# Actual posterior simulations we will keep. Run 5000 iterations.
mod1_sim = coda.samples(model=mod1, variable.names=params, n.iter=5e3) # Simulations are stored as matrices.

# Combine results from 3 different chains into one by stacking matrices that contain simulations.
mod1_csim = as.mcmc(do.call(rbind, mod1_sim))
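
As a quick sanity check (output not shown), the combined object should have 3 × 5000 = 15000 rows and one column per monitored parameter:

dim(mod1_csim)  # expect 15000 rows and 9 columns (8 slopes + intercept)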

Let’s perform some convergence diagnostics for the Markov Chains.

# Convergence diagnostics. Start with trace plots.
plot(mod1_sim, ask=TRUE) # Trace plots (not displayed in the shared file); each chain gets its own color. Traces look random with no trend, which is desirable.

gelman.diag(mod1_sim)  # Potential scale reduction factors of 1 indicate models have probably converged.
## Potential scale reduction factors:
## 
##      Point est. Upper C.I.
## b[1]          1          1
## b[2]          1          1
## b[3]          1          1
## b[4]          1          1
## b[5]          1          1
## b[6]          1          1
## b[7]          1          1
## b[8]          1          1
## int           1          1
## 
## Multivariate psrf
## 
## 1
autocorr.plot(mod1_sim) # Autocorrelation drops to near zero within the first 5 lags, so autocorrelation is not a concern for the estimated coefficients.
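
Another diagnostic worth a quick look (output not shown) is the effective sample size from the coda package; values close to the 15,000 retained draws indicate low autocorrelation:

effectiveSize(mod1_sim)  # effective number of independent draws per parameter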

Let’s also see the model summaries.

summary(mod1_sim)
## 
## Iterations = 2001:7000
## Thinning interval = 1 
## Number of chains = 3 
## Sample size per chain = 5000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##          Mean     SD  Naive SE Time-series SE
## b[1]  0.84304 0.2455 0.0020045       0.003346
## b[2]  2.36299 0.2691 0.0021968       0.003563
## b[3] -0.25768 0.2314 0.0018890       0.002804
## b[4]  0.03753 0.2464 0.0020117       0.003508
## b[5] -0.32622 0.2274 0.0018566       0.003087
## b[6]  1.42252 0.2781 0.0022706       0.004232
## b[7]  0.44512 0.2200 0.0017965       0.002387
## b[8]  0.34190 0.2479 0.0020241       0.003589
## int  -0.88238 0.1095 0.0008945       0.001226
## 
## 2. Quantiles for each variable:
## 
##           2.5%     25%      50%      75%   97.5%
## b[1]  0.372007  0.6778  0.84102  1.00426  1.3319
## b[2]  1.845652  2.1792  2.35956  2.54109  2.9060
## b[3] -0.713145 -0.4133 -0.25605 -0.09976  0.1901
## b[4] -0.445537 -0.1285  0.03759  0.20509  0.5216
## b[5] -0.777602 -0.4775 -0.32502 -0.17320  0.1163
## b[6]  0.891625  1.2358  1.41863  1.60220  1.9853
## b[7]  0.009256  0.3008  0.44473  0.59275  0.8803
## b[8] -0.147936  0.1771  0.34373  0.51121  0.8239
## int  -1.100709 -0.9553 -0.88153 -0.80756 -0.6703

Prediction

If we have the regression coefficients and the predictor values, we can plug them into the second equation for \(p_i\) above to get an estimate of the probability that Outcome = 1.

# Extract posterior mean of coefficients
pm_coef = colMeans(mod1_csim)

# The matrix multiplication below computes the linear predictor, which is then
# passed through the inverse logit to get the estimated probabilities.

pm_Xb = pm_coef["int"] + X1 %*% pm_coef[1:8] # Intercept + Design Matrix %*% Coefficients
phat = 1.0 / (1.0 + exp(-pm_Xb))  # Predicted probability that Outcome = 1 for each observation (equivalently, plogis(pm_Xb))

Plotting the predicted probabilities against the actual outcome values gives a rough idea of how well the model does on the training dataset.

plot(phat, jitter(finaldata.train$Outcome))

Looks okay. Observations to which the model assigns a low probability of Outcome = 1 were usually truly 0 in the data. It will be more interesting to see this on the test dataset.

Let’s select 0.5 as the cut-off: probabilities above 0.5 are labeled ‘1’ (presence of diabetes) and probabilities below 0.5 are labeled ‘0’ (absence of diabetes).

(tab0.5 = table(phat > 0.5, finaldata.train$Outcome))
##        
##           0   1
##   FALSE 353  94
##   TRUE   48 119
sum(diag(tab0.5)) / sum(tab0.5)  # Correct classification rate in the training dataset
## [1] 0.7687296
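
From the same table we can also read off sensitivity and specificity at this cut-off; a small sketch (the names sens and spec are ours):

sens = tab0.5["TRUE", "1"] / sum(tab0.5[, "1"])   # 119/213: diabetics correctly flagged
spec = tab0.5["FALSE", "0"] / sum(tab0.5[, "0"])  # 353/401: non-diabetics correctly cleared
c(sensitivity = sens, specificity = spec)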

Now, let’s see the model’s performance on the test dataset. Again, we start by standardizing the data.

X2 = scale(finaldata.test[,-1]) * 0.5  # scale the test predictors to mean 0, sd 0.5
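
Note that this scales the test set with its own means and standard deviations. A stricter alternative, sketched below but not used in this analysis, reuses the training-set centers and spreads so that both sets are on exactly the same scale (X2.alt is our name):

# Illustrative alternative: scale the test predictors with the training means/SDs.
ctr = colMeans(finaldata.train[,-1])
scl = apply(finaldata.train[,-1], 2, sd)
X2.alt = scale(finaldata.test[,-1], center = ctr, scale = scl) * 0.5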

Now, using the coefficients obtained, let’s find the predicted probabilities of individual observations in the test dataset.

pm_coef = colMeans(mod1_csim)
pm_Xb = pm_coef["int"] + X2 %*% pm_coef[1:8] # Intercept + Design Matrix %*% Coefficients
phat = 1.0 / (1.0 + exp(-pm_Xb))

Model performance in the test dataset.

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
plot(phat, jitter(finaldata.test$Outcome))

(tab0.5 = table(phat > 0.5, finaldata.test$Outcome))
##        
##          0  1
##   FALSE 88 22
##   TRUE  11 33
sum(diag(tab0.5)) / sum(tab0.5)  # Correct classification rate in the test dataset
## [1] 0.7857143
roc(finaldata.test$Outcome, as.numeric(phat),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(phat),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(phat) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8492
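
If we want an uncertainty statement to accompany the AUC, pROC provides one via ci.auc() (output not shown; roc1 is our name):

roc1 = roc(finaldata.test$Outcome, as.numeric(phat))
ci.auc(roc1)  # DeLong confidence interval for the AUC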

We find accuracy similar to that on the training data. Next, we want to fit the model with a different prior and compare the two models’ performance. This time we will use weakly informative Cauchy priors for the coefficients. As recommended by Gelman et al. (2008), the data are first standardized so that all continuous variables have mean 0 and standard deviation 0.5. The intercept then gets a Cauchy prior centered at 0 with scale 10, and the other coefficients get scale 2.5. We will set up, specify, and run that model on the training data shortly.

Before that, let’s try keeping only the significant variables. From the posterior summary above, only Pregnancies, Glucose, BMI, and DiabetesPedigreeFunction (b[1], b[2], b[6], b[7]) have 95% credible intervals that exclude zero, so we predict using those four coefficients alone, reusing the full-model posterior means rather than refitting.

# Extract posterior mean of coefficients
pm_coef = colMeans(mod1_csim)

# The matrix multiplication below computes the linear predictor from the four
# significant coefficients only (posterior means from the full model, not a refit).

pm_Xb = pm_coef["int"] + X1[,c(1,2,6,7)] %*% pm_coef[c(1,2,6,7)] # Intercept + reduced Design Matrix %*% Coefficients
phat = 1.0 / (1.0 + exp(-pm_Xb))  # Predicted probability that Outcome = 1 for each observation
plot(phat, jitter(finaldata.train$Outcome))

(tab0.5 = table(phat > 0.5, finaldata.train$Outcome))
##        
##           0   1
##   FALSE 357  94
##   TRUE   44 119
sum(diag(tab0.5)) / sum(tab0.5)  # Correct classification rate in the training dataset
## [1] 0.7752443
X2 = scale(finaldata.test[,-1]) * 0.5  # rescale the test predictors as before

Now, using the coefficients obtained, let’s find the predicted probabilities of individual observations in the test dataset.

pm_coef = colMeans(mod1_csim)
pm_Xb = pm_coef["int"] + X2[,c(1,2,6,7)] %*% pm_coef[c(1,2,6,7)] # Intercept + reduced Design Matrix %*% Coefficients
phat = 1.0 / (1.0 + exp(-pm_Xb))

Model performance in the test dataset.

library(pROC)
plot(phat, jitter(finaldata.test$Outcome))

(tab0.5 = table(phat > 0.5, finaldata.test$Outcome))
##        
##          0  1
##   FALSE 86 24
##   TRUE  13 31
sum(diag(tab0.5)) / sum(tab0.5)  # Correct classification rate in the test dataset
## [1] 0.7597403
roc(finaldata.test$Outcome, as.numeric(phat),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(phat),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(phat) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8507

With Cauchy Prior

mod2_string = " model {
    for (i in 1:length(y)) {
        y[i] ~ dbern(p[i])
        logit(p[i]) = int + b[1]*Pregnancies[i] + b[2]*Glucose[i] + b[3]*BloodPressure[i] + b[4]*SkinThickness[i] + b[5]*Insulin[i] + b[6]*BMI[i] + b[7]*DiabetesPedigreeFunction[i] + b[8]*Age[i]
    }
    int ~ dt(0, 1/10^2, 1)    # t prior with 1 df (i.e., Cauchy) centered at 0 with scale 10: weakly informative
    for (j in 1:8) {
        b[j] ~ dt(0, 1/2.5^2, 1)  # Cauchy prior centered at 0 with scale 2.5
    }
}"

Specify the parameters and run MCMC.

set.seed(44)

data_jags = list(y=finaldata.train$Outcome, Pregnancies=X1[,"Pregnancies"], Glucose=X1[,"Glucose"], BloodPressure=X1[,"BloodPressure"], SkinThickness=X1[,"SkinThickness"], Insulin=X1[,"Insulin"], BMI=X1[,"BMI"], DiabetesPedigreeFunction=X1[,"DiabetesPedigreeFunction"], Age=X1[,"Age"])

# Parameters to monitor: the intercept and the coefficients.
params = c("int", "b")

# Specify the model itself.
mod2 = jags.model(textConnection(mod2_string), data=data_jags, n.chains=3) # Run three different chains with different starting values
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 614
##    Unobserved stochastic nodes: 9
##    Total graph size: 7898
## 
## Initializing model
# Burn-in period of 1000 iterations: samples from these iterations are not kept.
update(mod2, 1e3)

# Actual posterior simulations we will keep. Run 5000 iterations.
mod2_sim = coda.samples(model=mod2, variable.names=params, n.iter=5e3) # Simulations are stored as matrices.

# Combine results from 3 different chains into one by stacking matrices that contain simulations.
mod2_csim = as.mcmc(do.call(rbind, mod2_sim))

Again, let’s perform some convergence diagnostics.

# Convergence diagnostics. Start with trace plots.
plot(mod2_sim, ask=TRUE) # Trace plots (not displayed in the shared file); each chain gets its own color. Traces look random with no trend, which is desirable.

gelman.diag(mod2_sim)  # Potential scale reduction factors of 1 indicate models have probably converged.
## Potential scale reduction factors:
## 
##      Point est. Upper C.I.
## b[1]          1          1
## b[2]          1          1
## b[3]          1          1
## b[4]          1          1
## b[5]          1          1
## b[6]          1          1
## b[7]          1          1
## b[8]          1          1
## int           1          1
## 
## Multivariate psrf
## 
## 1
autocorr.plot(mod2_sim) # Autocorrelation drops to near zero within the first 5 lags, so autocorrelation is not a concern for the estimated coefficients.

Model summary again.

summary(mod2_sim)
## 
## Iterations = 2001:7000
## Thinning interval = 1 
## Number of chains = 3 
## Sample size per chain = 5000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##          Mean     SD Naive SE Time-series SE
## b[1]  0.82762 0.2433 0.001986       0.003313
## b[2]  2.33259 0.2649 0.002163       0.003319
## b[3] -0.23768 0.2279 0.001861       0.002745
## b[4]  0.03956 0.2478 0.002023       0.003474
## b[5] -0.30836 0.2194 0.001791       0.002817
## b[6]  1.39416 0.2725 0.002225       0.003882
## b[7]  0.43568 0.2204 0.001800       0.002272
## b[8]  0.33740 0.2494 0.002036       0.003551
## int  -0.87575 0.1085 0.000886       0.001210
## 
## 2. Quantiles for each variable:
## 
##          2.5%     25%      50%      75%   97.5%
## b[1]  0.35705  0.6679  0.82537  0.98822  1.3126
## b[2]  1.82230  2.1526  2.32745  2.50975  2.8540
## b[3] -0.67670 -0.3949 -0.23839 -0.08493  0.2101
## b[4] -0.45467 -0.1249  0.04031  0.20844  0.5226
## b[5] -0.74625 -0.4532 -0.30718 -0.16375  0.1221
## b[6]  0.86398  1.2122  1.39713  1.57598  1.9286
## b[7]  0.00111  0.2857  0.43354  0.58364  0.8718
## b[8] -0.15613  0.1695  0.34060  0.50509  0.8187
## int  -1.08961 -0.9486 -0.87504 -0.80076 -0.6696

Prediction

# Extract posterior mean of coefficients
pm_coef2 = colMeans(mod2_csim)

# The matrix multiplication below computes the linear predictor, again from the
# four significant coefficients only.

pm_Xb2 = pm_coef2["int"] + X1[,c(1,2,6,7)] %*% pm_coef2[c(1,2,6,7)] # Intercept + reduced Design Matrix %*% Coefficients
phat2 = 1.0 / (1.0 + exp(-pm_Xb2))  # Predicted probability that Outcome = 1 for each observation
plot(phat2, jitter(finaldata.train$Outcome))

(tab0.5 = table(phat2 > 0.5, finaldata.train$Outcome))
##        
##           0   1
##   FALSE 357  96
##   TRUE   44 117
sum(diag(tab0.5)) / sum(tab0.5)  # Correct classification rate in the training dataset
## [1] 0.771987

Now, let’s see the performance on the test dataset.

pm_coef2 = colMeans(mod2_csim)
pm_Xb2 = pm_coef2["int"] + X2[,c(1,2,6,7)] %*% pm_coef2[c(1,2,6,7)] # Intercept + Design Matrix*Coefficients
phat2 = 1.0 / (1.0 + exp(-pm_Xb2))
plot(phat2, jitter(finaldata.test$Outcome))

(tab0.5 = table(phat2 > 0.5, finaldata.test$Outcome))
##        
##          0  1
##   FALSE 86 24
##   TRUE  13 31
sum(diag(tab0.5)) / sum(tab0.5)  # Correct classification rate in the testing dataset
## [1] 0.7597403
roc(finaldata.test$Outcome, as.numeric(phat2),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(phat2),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(phat2) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8505
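
Finally, a more Bayesian way to compare the two fits, not run in the original write-up, is the Deviance Information Criterion, which rjags estimates with dic.samples(); lower DIC is better:

dic1 = dic.samples(mod1, n.iter = 1e3)  # penalized deviance for the normal-prior model
dic2 = dic.samples(mod2, n.iter = 1e3)  # penalized deviance for the Cauchy-prior model
dic1; dic2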