Pima Indians Diabetes Classification

Introduction

This markdown is based on the Pima Indians Diabetes dataset available on the UCI Machine Learning library.

You can find more information about the dataset here

I will use 4 methods namely Logistic Regression, Bayesian Logistic Regression, Decision Trees & Random Forests to solve the problem, and then compare the accuracy between these methods.

For the future: Test the ADAP algorithm

Data Initialization

library(ggplot2)
library(dplyr)
library(readr)
library(corrplot)
library(caret)
library(ROCR)
library(tree)
library(randomForest)
library(rstanarm)
library(pROC)

pima <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data",col_names=c("Pregnant","Plasma_Glucose","Dias_BP","Triceps_Skin","Serum_Insulin","BMI","DPF","Age","Diabetes"))

#Lets count the no. of NA values
sapply(pima, function(x) sum(is.na(x)))

##       Pregnant Plasma_Glucose        Dias_BP   Triceps_Skin  Serum_Insulin 
##              0              0              0              0              0 
##            BMI            DPF            Age       Diabetes 
##              0              0              0              0

#Lets look at a basic summary for the variables
summary(pima)

##     Pregnant      Plasma_Glucose     Dias_BP        Triceps_Skin  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##  Serum_Insulin        BMI             DPF              Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##     Diabetes    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

#Convert the Diabetes variable into a factor
pima$Diabetes <- as.factor(pima$Diabetes)

#Remove entry if Glucose,BP or BMI == 0
pima <- pima[apply(pima[,c(2,3,6)],1,function(x) !any(x==0)),]

Lets dive into the problem

Data Vizualization

gluc_mean <- pima %>% group_by(Diabetes) %>% summarise(Plas = round(mean(Plasma_Glucose),2))

#Relationship between Plasma Glucose & Diabetes
ggplot(data=pima,aes(Diabetes,Plasma_Glucose)) + 
      geom_boxplot(aes(fill=Diabetes)) + stat_boxplot(geom = "errorbar") + 
      ggtitle("Diabetes rates against Plasma Glucose Levels") + 
      xlab("Diabetes") + ylab("Plasma Glucose") + guides(fill=F) + 
      geom_text(data = gluc_mean, aes(x=Diabetes,y=Plas,label=Plas),
                hjust = -1.5,vjust=-0.5)

ins_mean <- pima %>% group_by(Diabetes) %>% summarise(Plas = round(mean(Serum_Insulin),2))

#Relationship between Diabetes & Serum Insulin levels
ggplot(data=pima,aes(Diabetes,Serum_Insulin)) + 
      geom_boxplot(aes(fill=Diabetes)) + stat_boxplot(geom = "errorbar") + 
      ggtitle("Diabetes rates against Serum Insulin Levels") + 
      xlab("Diabetes") + ylab("Serum Insulin") + guides(fill=F) + 
      geom_text(data = ins_mean, aes(x=Diabetes,y=Plas,label=Plas),
                hjust = -1.5,vjust=-0.5)

#Difference in Blood Pressure & BMI for Diabetics
ggplot(data = pima,aes(Dias_BP,BMI)) + geom_point(aes(colour=Diabetes),alpha=0.6) +
      xlab("Diastolic Blood Pressure") + ylab("Body Mass Index") + 
      ggtitle("Interaction between Blood Pressure & BMI for Diabetics") + 
      labs(colour="Diabetes Status") + 
      scale_colour_manual(values = c("#D55E00", "#009E73"))

#Relationship between pregnancy and diabetes
ggplot(pima, aes(Pregnant, fill = Diabetes)) +
      geom_density() + ylab("Distribution of Pregnancy") + 
      ggtitle("Pregnant Women vs. the threat of Diabetes")

#Correlelogram to help remove variables which may be correlated to one another
corrplot(cor(pima[,-9]),type = "lower", method = "number")

We can see from the correlogram that none of the variables are very highly correlated to each other. We will move forward and use all the variables in our exploratory models.

Logistic Regression

set.seed(15689)
index <- createDataPartition(pima$Diabetes,p = 0.7,list = F)
train <- pima[index,]
test  <- pima[-index,]

#Logistic Regression

m1 <- glm(Diabetes ~ ., data = train, family = binomial(link = "logit"))
summary(m1)

## 
## Call:
## glm(formula = Diabetes ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7462  -0.7373  -0.4213   0.7777   2.3671  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -8.5194860  0.9591930  -8.882  < 2e-16 ***
## Pregnant        0.1229543  0.0388712   3.163  0.00156 ** 
## Plasma_Glucose  0.0333662  0.0045310   7.364 1.78e-13 ***
## Dias_BP        -0.0069527  0.0106329  -0.654  0.51319    
## Triceps_Skin   -0.0006152  0.0084432  -0.073  0.94192    
## Serum_Insulin  -0.0004671  0.0010534  -0.443  0.65744    
## BMI             0.0859095  0.0193407   4.442 8.92e-06 ***
## DPF             1.1047436  0.3510801   3.147  0.00165 ** 
## Age             0.0107850  0.0121142   0.890  0.37332    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 654.27  on 507  degrees of freedom
## Residual deviance: 489.46  on 499  degrees of freedom
## AIC: 507.46
## 
## Number of Fisher Scoring iterations: 5

anova(m1,test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Diabetes
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             507     654.27              
## Pregnant        1   25.498       506     628.77 4.427e-07 ***
## Plasma_Glucose  1   99.981       505     528.79 < 2.2e-16 ***
## Dias_BP         1    0.993       504     527.80  0.319063    
## Triceps_Skin    1    4.826       503     522.97  0.028039 *  
## Serum_Insulin   1    0.059       502     522.91  0.807291    
## BMI             1   22.087       501     500.83 2.605e-06 ***
## DPF             1   10.576       500     490.25  0.001146 ** 
## Age             1    0.787       499     489.46  0.375008    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Remodel using only the significant variables
mod_fin <- glm(Diabetes ~ Pregnant + Plasma_Glucose + Triceps_Skin + BMI + DPF,
               data = train, family = binomial(link = "logit"))
summary(mod_fin)

## 
## Call:
## glm(formula = Diabetes ~ Pregnant + Plasma_Glucose + Triceps_Skin + 
##     BMI + DPF, family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8389  -0.7415  -0.4282   0.7705   2.4019  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -8.552272   0.833044 -10.266  < 2e-16 ***
## Pregnant        0.138389   0.032304   4.284 1.84e-05 ***
## Plasma_Glucose  0.033073   0.004102   8.064 7.41e-16 ***
## Triceps_Skin   -0.002348   0.007692  -0.305  0.76018    
## BMI             0.081597   0.018225   4.477 7.56e-06 ***
## DPF             1.112942   0.348642   3.192  0.00141 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 654.27  on 507  degrees of freedom
## Residual deviance: 490.60  on 502  degrees of freedom
## AIC: 502.6
## 
## Number of Fisher Scoring iterations: 5

#Residuals 
summary(residuals(mod_fin))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.8390 -0.7415 -0.4282 -0.0816  0.7705  2.4020

par(mfrow=c(2,2))
plot(mod_fin)

#Apply the model to the testing sample
test_pred <- predict(mod_fin,test, type = "response")
pred_test <- as.data.frame(cbind(test$Diabetes,test_pred))
colnames(pred_test) <- c("Original","Test_pred")
pred_test$outcome <- ifelse(pred_test$Test_pred > 0.5, 1, 0)
error <- mean(pred_test$outcome != test$Diabetes)
print(paste('Test Data Accuracy', round(1-error,2)*100,'%'))

## [1] "Test Data Accuracy 81 %"

confusionMatrix(test$Diabetes,pred_test$outcome)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 131  11
##          1  29  45
##                                           
##                Accuracy : 0.8148          
##                  95% CI : (0.7565, 0.8643)
##     No Information Rate : 0.7407          
##     P-Value [Acc > NIR] : 0.006616        
##                                           
##                   Kappa : 0.5635          
##  Mcnemar's Test P-Value : 0.007190        
##                                           
##             Sensitivity : 0.8187          
##             Specificity : 0.8036          
##          Pos Pred Value : 0.9225          
##          Neg Pred Value : 0.6081          
##              Prevalence : 0.7407          
##          Detection Rate : 0.6065          
##    Detection Prevalence : 0.6574          
##       Balanced Accuracy : 0.8112          
##                                           
##        'Positive' Class : 0               
##

acc_lg <- confusionMatrix(test$Diabetes,pred_test$outcome)$overall['Accuracy']

# Get the ROC curve and the AUC
par(mfrow=c(1,1))
plot.roc(test$Diabetes,test_pred,percent=TRUE,col="#1c61b6",print.auc=TRUE,
         main = "Area under the curve for Logistic Regression")

## 
## Call:
## plot.roc.default(x = test$Diabetes, predictor = test_pred, percent = TRUE,     col = "#1c61b6", print.auc = TRUE, main = "Area under the curve for Logistic Regression")
## 
## Data: test_pred in 142 controls (test$Diabetes 0) < 74 cases (test$Diabetes 1).
## Area under the curve: 87.32%

Bayesian Logistic Regression

For a Binomial GLM such as the case above, the likelihood of each observation is calculated using: \[(ny)πy(1−π)n−y\]

Because this is a probability based equation we use a link function, in this case logit to map the unit interval to a set of Real Numbers.

The difference in using a Bayesian approach is to specify prior distributions to the intercept and regression co-effecient terms. This helps us incorporate our beliefs of how the intercepts will turn out to be into the equation. We can specify this using any standard continuous distribution.

You can read more about Bayesian Logistic Regression here:

#Bayesian Logistic Regression
prior_dist <- student_t(df = 7, location = 0, scale = 2.5)
bayes_mod  <- stan_glm(Diabetes ~ ., data = train,
                  family = binomial(link = "logit"), 
                  prior = prior_dist, prior_intercept = prior_dist,
                  seed = 15689)

## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## 
## Chain 1, Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1, Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1, Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1, Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1, Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1, Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1, Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1, Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1, Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1, Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1, Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1, Iteration: 2000 / 2000 [100%]  (Sampling)
##  Elapsed Time: 1.15819 seconds (Warm-up)
##                1.25645 seconds (Sampling)
##                2.41463 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## 
## Chain 2, Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2, Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2, Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2, Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2, Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2, Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2, Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2, Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2, Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2, Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2, Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2, Iteration: 2000 / 2000 [100%]  (Sampling)
##  Elapsed Time: 1.18468 seconds (Warm-up)
##                1.20156 seconds (Sampling)
##                2.38624 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## 
## Chain 3, Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3, Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3, Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3, Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3, Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3, Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3, Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3, Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3, Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3, Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3, Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3, Iteration: 2000 / 2000 [100%]  (Sampling)
##  Elapsed Time: 1.22443 seconds (Warm-up)
##                0.976971 seconds (Sampling)
##                2.2014 seconds (Total)
## 
## 
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## 
## Chain 4, Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4, Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4, Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4, Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4, Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4, Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4, Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4, Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4, Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4, Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4, Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4, Iteration: 2000 / 2000 [100%]  (Sampling)
##  Elapsed Time: 1.20294 seconds (Warm-up)
##                1.21009 seconds (Sampling)
##                2.41303 seconds (Total)

#Confidence Intervals for the predictors
posterior_interval(bayes_mod, prob = 0.95)

##                         2.5%        97.5%
## (Intercept)    -10.519516602 -6.768354355
## Pregnant         0.048804720  0.198337786
## Plasma_Glucose   0.024937244  0.042726827
## Dias_BP         -0.027183802  0.014188527
## Triceps_Skin    -0.017526337  0.016359135
## Serum_Insulin   -0.002479761  0.001594718
## BMI              0.050794450  0.125171100
## DPF              0.426009682  1.802053392
## Age             -0.013343434  0.033615016

#Residuals for the Bayesian Model
summary(residuals(bayes_mod))

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.977900 -0.234900 -0.083110  0.000999  0.258700  0.940900

bayes_res <- data.frame(residuals(bayes_mod))
bayes_res$index <- seq.int(nrow(bayes_res)) 
colnames(bayes_res) <- "Residuals"

#Plotting the residuals
ggplot(data = bayes_res,aes(index,Residuals)) + geom_point() + ggtitle("Representation of randomness amongst Residuals")

ggplot(data = bayes_res,aes(Residuals)) + geom_density(aes(fill=Residuals)) + 
ylab("Density") + ggtitle("Distribution of Residuals")

#Predicting Probabilities for the test data
pred <- posterior_linpred(bayes_mod, newdata = test, transform=TRUE)
fin_pred <- colMeans(pred)
test_prediction <- as.integer(fin_pred >= 0.5)

confusionMatrix(test$Diabetes,test_prediction)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 129  13
##          1  27  47
##                                           
##                Accuracy : 0.8148          
##                  95% CI : (0.7565, 0.8643)
##     No Information Rate : 0.7222          
##     P-Value [Acc > NIR] : 0.00107         
##                                           
##                   Kappa : 0.5694          
##  Mcnemar's Test P-Value : 0.03983         
##                                           
##             Sensitivity : 0.8269          
##             Specificity : 0.7833          
##          Pos Pred Value : 0.9085          
##          Neg Pred Value : 0.6351          
##              Prevalence : 0.7222          
##          Detection Rate : 0.5972          
##    Detection Prevalence : 0.6574          
##       Balanced Accuracy : 0.8051          
##                                           
##        'Positive' Class : 0               
##

acc_bayes <- confusionMatrix(test$Diabetes,test_prediction)$overall['Accuracy']

plot.roc(test$Diabetes,fin_pred,percent=TRUE,col="#1c61b6", print.auc=TRUE,
         main = "Area under the curve for Bayesian Logistic Regression")

## 
## Call:
## plot.roc.default(x = test$Diabetes, predictor = fin_pred, percent = TRUE,     col = "#1c61b6", print.auc = TRUE, main = "Area under the curve for Bayesian Logistic Regression")
## 
## Data: fin_pred in 142 controls (test$Diabetes 0) < 74 cases (test$Diabetes 1).
## Area under the curve: 87.62%

Decision Trees & Random Forests

#Decision Trees
set.seed(15689)
m_dt <- tree(Diabetes ~ ., data = train)
pred_dt <- predict(m_dt, train, type = "class")
confusionMatrix(train$Diabetes,pred_dt)[2:3]

## $table
##           Reference
## Prediction   0   1
##          0 293  40
##          1  55 120
## 
## $overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.129921e-01   5.773340e-01   7.763139e-01   8.459794e-01   6.850394e-01 
## AccuracyPValue  McnemarPValue 
##   5.547677e-11   1.508972e-01

plot(m_dt)
text(m_dt, pretty = 0)

pred_dt_test <- predict(m_dt, test, type = "class")
confusionMatrix(test$Diabetes,pred_dt_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 108  34
##          1  30  44
##                                          
##                Accuracy : 0.7037         
##                  95% CI : (0.638, 0.7637)
##     No Information Rate : 0.6389         
##     P-Value [Acc > NIR] : 0.02664        
##                                          
##                   Kappa : 0.3506         
##  Mcnemar's Test P-Value : 0.70766        
##                                          
##             Sensitivity : 0.7826         
##             Specificity : 0.5641         
##          Pos Pred Value : 0.7606         
##          Neg Pred Value : 0.5946         
##              Prevalence : 0.6389         
##          Detection Rate : 0.5000         
##    Detection Prevalence : 0.6574         
##       Balanced Accuracy : 0.6734         
##                                          
##        'Positive' Class : 0              
##

acc_dt <- confusionMatrix(pred_dt_test,test$Diabetes)$overall['Accuracy']

#Random Forest
set.seed(15689)

opt_mod <- tuneRF(train[-as.numeric(ncol(train))],train$Diabetes,ntreeTry = 150, 
           stepFactor = 2, improve = 0.05,trace = T, plot = T, doBest = F)

## mtry = 2  OOB error = 25.59% 
## Searching left ...
## mtry = 1     OOB error = 26.57% 
## -0.03846154 0.05 
## Searching right ...
## mtry = 4     OOB error = 25% 
## 0.02307692 0.05

mtry_fin <- opt_mod[as.numeric(which.min(opt_mod[,"OOBError"])),"mtry"]

rf_fin <- randomForest(Diabetes~.,data=train, mtry=mtry_fin, ntree=101, 
          keep.forest=TRUE, proximity=TRUE, importance=TRUE,test=test)

pred_test <- predict(rf_fin, newdata = test)
confusionMatrix(test$Diabetes,pred_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 117  25
##          1  23  51
##                                           
##                Accuracy : 0.7778          
##                  95% CI : (0.7164, 0.8314)
##     No Information Rate : 0.6481          
##     P-Value [Acc > NIR] : 2.515e-05       
##                                           
##                   Kappa : 0.5098          
##  Mcnemar's Test P-Value : 0.8852          
##                                           
##             Sensitivity : 0.8357          
##             Specificity : 0.6711          
##          Pos Pred Value : 0.8239          
##          Neg Pred Value : 0.6892          
##              Prevalence : 0.6481          
##          Detection Rate : 0.5417          
##    Detection Prevalence : 0.6574          
##       Balanced Accuracy : 0.7534          
##                                           
##        'Positive' Class : 0               
##

acc_rf <- confusionMatrix(test$Diabetes,pred_test)$overall['Accuracy']

par(mfrow=c(1,2))
varImpPlot(rf_fin,type = 2,main = "Variable Importance",col = 'black')
plot(rf_fin,main = "Error vs no. of trees grown")

Model Comparison

accuracy <- data.frame(Model=c("Logistic","Bayesian Logistic","Decision Tree","Random Forest"),Accuracy=c(acc_lg,acc_bayes,acc_dt,acc_rf))

ggplot(accuracy,aes(x=Model,y=Accuracy))+geom_bar(stat='identity')+theme_bw()+
ggtitle('Comparison of Model Accuracy')

From the output it is pretty evident that all the 4 models behave almost similar to each other with very little difference between the models. In order to improve accuracy we can use higher dimension classifiers such as Boosting algorithms, but for the purposes for this exercise I will not be doing it. This was just a simple workbook in order to help myself explore Bayesian Logistic Regression.