This markdown is based on the Pima Indians Diabetes dataset available on the UCI Machine Learning library.
You can find more information about the dataset here
I will use 4 methods namely Logistic Regression, Bayesian Logistic Regression, Decision Trees & Random Forests to solve the problem, and then compare the accuracy between these methods.
For the future: Test the ADAP algorithm
library(ggplot2)
library(dplyr)
library(readr)
library(corrplot)
library(caret)
library(ROCR)
library(tree)
library(randomForest)
library(rstanarm)
library(pROC)
pima <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data",col_names=c("Pregnant","Plasma_Glucose","Dias_BP","Triceps_Skin","Serum_Insulin","BMI","DPF","Age","Diabetes"))
#Lets count the no. of NA values
sapply(pima, function(x) sum(is.na(x)))
## Pregnant Plasma_Glucose Dias_BP Triceps_Skin Serum_Insulin
## 0 0 0 0 0
## BMI DPF Age Diabetes
## 0 0 0 0
#Lets look at a basic summary for the variables
summary(pima)
## Pregnant Plasma_Glucose Dias_BP Triceps_Skin
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Serum_Insulin BMI DPF Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Diabetes
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
#Convert the Diabetes variable into a factor
pima$Diabetes <- as.factor(pima$Diabetes)
#Remove entry if Glucose,BP or BMI == 0
pima <- pima[apply(pima[,c(2,3,6)],1,function(x) !any(x==0)),]
gluc_mean <- pima %>% group_by(Diabetes) %>% summarise(Plas = round(mean(Plasma_Glucose),2))
#Relationship between Plasma Glucose & Diabetes
ggplot(data=pima,aes(Diabetes,Plasma_Glucose)) +
geom_boxplot(aes(fill=Diabetes)) + stat_boxplot(geom = "errorbar") +
ggtitle("Diabetes rates against Plasma Glucose Levels") +
xlab("Diabetes") + ylab("Plasma Glucose") + guides(fill=F) +
geom_text(data = gluc_mean, aes(x=Diabetes,y=Plas,label=Plas),
hjust = -1.5,vjust=-0.5)
ins_mean <- pima %>% group_by(Diabetes) %>% summarise(Plas = round(mean(Serum_Insulin),2))
#Relationship between Diabetes & Serum Insulin levels
ggplot(data=pima,aes(Diabetes,Serum_Insulin)) +
geom_boxplot(aes(fill=Diabetes)) + stat_boxplot(geom = "errorbar") +
ggtitle("Diabetes rates against Serum Insulin Levels") +
xlab("Diabetes") + ylab("Serum Insulin") + guides(fill=F) +
geom_text(data = ins_mean, aes(x=Diabetes,y=Plas,label=Plas),
hjust = -1.5,vjust=-0.5)
#Difference in Blood Pressure & BMI for Diabetics
ggplot(data = pima,aes(Dias_BP,BMI)) + geom_point(aes(colour=Diabetes),alpha=0.6) +
xlab("Diastolic Blood Pressure") + ylab("Body Mass Index") +
ggtitle("Interaction between Blood Pressure & BMI for Diabetics") +
labs(colour="Diabetes Status") +
scale_colour_manual(values = c("#D55E00", "#009E73"))
#Relationship between pregnancy and diabetes
ggplot(pima, aes(Pregnant, fill = Diabetes)) +
geom_density() + ylab("Distribution of Pregnancy") +
ggtitle("Pregnant Women vs. the threat of Diabetes")
#Correlelogram to help remove variables which may be correlated to one another
corrplot(cor(pima[,-9]),type = "lower", method = "number")
We can see from the correlogram that none of the variables are very highly correlated to each other. We will move forward and use all the variables in our exploratory models.
set.seed(15689)
index <- createDataPartition(pima$Diabetes,p = 0.7,list = F)
train <- pima[index,]
test <- pima[-index,]
#Logistic Regression
m1 <- glm(Diabetes ~ ., data = train, family = binomial(link = "logit"))
summary(m1)
##
## Call:
## glm(formula = Diabetes ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7462 -0.7373 -0.4213 0.7777 2.3671
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.5194860 0.9591930 -8.882 < 2e-16 ***
## Pregnant 0.1229543 0.0388712 3.163 0.00156 **
## Plasma_Glucose 0.0333662 0.0045310 7.364 1.78e-13 ***
## Dias_BP -0.0069527 0.0106329 -0.654 0.51319
## Triceps_Skin -0.0006152 0.0084432 -0.073 0.94192
## Serum_Insulin -0.0004671 0.0010534 -0.443 0.65744
## BMI 0.0859095 0.0193407 4.442 8.92e-06 ***
## DPF 1.1047436 0.3510801 3.147 0.00165 **
## Age 0.0107850 0.0121142 0.890 0.37332
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 654.27 on 507 degrees of freedom
## Residual deviance: 489.46 on 499 degrees of freedom
## AIC: 507.46
##
## Number of Fisher Scoring iterations: 5
anova(m1,test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Diabetes
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 507 654.27
## Pregnant 1 25.498 506 628.77 4.427e-07 ***
## Plasma_Glucose 1 99.981 505 528.79 < 2.2e-16 ***
## Dias_BP 1 0.993 504 527.80 0.319063
## Triceps_Skin 1 4.826 503 522.97 0.028039 *
## Serum_Insulin 1 0.059 502 522.91 0.807291
## BMI 1 22.087 501 500.83 2.605e-06 ***
## DPF 1 10.576 500 490.25 0.001146 **
## Age 1 0.787 499 489.46 0.375008
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Remodel using only the significant variables
mod_fin <- glm(Diabetes ~ Pregnant + Plasma_Glucose + Triceps_Skin + BMI + DPF,
data = train, family = binomial(link = "logit"))
summary(mod_fin)
##
## Call:
## glm(formula = Diabetes ~ Pregnant + Plasma_Glucose + Triceps_Skin +
## BMI + DPF, family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8389 -0.7415 -0.4282 0.7705 2.4019
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.552272 0.833044 -10.266 < 2e-16 ***
## Pregnant 0.138389 0.032304 4.284 1.84e-05 ***
## Plasma_Glucose 0.033073 0.004102 8.064 7.41e-16 ***
## Triceps_Skin -0.002348 0.007692 -0.305 0.76018
## BMI 0.081597 0.018225 4.477 7.56e-06 ***
## DPF 1.112942 0.348642 3.192 0.00141 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 654.27 on 507 degrees of freedom
## Residual deviance: 490.60 on 502 degrees of freedom
## AIC: 502.6
##
## Number of Fisher Scoring iterations: 5
#Residuals
summary(residuals(mod_fin))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.8390 -0.7415 -0.4282 -0.0816 0.7705 2.4020
par(mfrow=c(2,2))
plot(mod_fin)
#Apply the model to the testing sample
test_pred <- predict(mod_fin,test, type = "response")
pred_test <- as.data.frame(cbind(test$Diabetes,test_pred))
colnames(pred_test) <- c("Original","Test_pred")
pred_test$outcome <- ifelse(pred_test$Test_pred > 0.5, 1, 0)
error <- mean(pred_test$outcome != test$Diabetes)
print(paste('Test Data Accuracy', round(1-error,2)*100,'%'))
## [1] "Test Data Accuracy 81 %"
confusionMatrix(test$Diabetes,pred_test$outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 131 11
## 1 29 45
##
## Accuracy : 0.8148
## 95% CI : (0.7565, 0.8643)
## No Information Rate : 0.7407
## P-Value [Acc > NIR] : 0.006616
##
## Kappa : 0.5635
## Mcnemar's Test P-Value : 0.007190
##
## Sensitivity : 0.8187
## Specificity : 0.8036
## Pos Pred Value : 0.9225
## Neg Pred Value : 0.6081
## Prevalence : 0.7407
## Detection Rate : 0.6065
## Detection Prevalence : 0.6574
## Balanced Accuracy : 0.8112
##
## 'Positive' Class : 0
##
acc_lg <- confusionMatrix(test$Diabetes,pred_test$outcome)$overall['Accuracy']
# Get the ROC curve and the AUC
par(mfrow=c(1,1))
plot.roc(test$Diabetes,test_pred,percent=TRUE,col="#1c61b6",print.auc=TRUE,
main = "Area under the curve for Logistic Regression")
##
## Call:
## plot.roc.default(x = test$Diabetes, predictor = test_pred, percent = TRUE, col = "#1c61b6", print.auc = TRUE, main = "Area under the curve for Logistic Regression")
##
## Data: test_pred in 142 controls (test$Diabetes 0) < 74 cases (test$Diabetes 1).
## Area under the curve: 87.32%
For a Binomial GLM such as the case above, the likelihood of each observation is calculated using: \[(ny)πy(1−π)n−y\]
Because this is a probability based equation we use a link function, in this case logit to map the unit interval to a set of Real Numbers.
The difference in using a Bayesian approach is to specify prior distributions to the intercept and regression co-effecient terms. This helps us incorporate our beliefs of how the intercepts will turn out to be into the equation. We can specify this using any standard continuous distribution.
You can read more about Bayesian Logistic Regression here:
#Bayesian Logistic Regression
prior_dist <- student_t(df = 7, location = 0, scale = 2.5)
bayes_mod <- stan_glm(Diabetes ~ ., data = train,
family = binomial(link = "logit"),
prior = prior_dist, prior_intercept = prior_dist,
seed = 15689)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
##
## Chain 1, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 1.15819 seconds (Warm-up)
## 1.25645 seconds (Sampling)
## 2.41463 seconds (Total)
##
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
##
## Chain 2, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 1.18468 seconds (Warm-up)
## 1.20156 seconds (Sampling)
## 2.38624 seconds (Total)
##
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
##
## Chain 3, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 1.22443 seconds (Warm-up)
## 0.976971 seconds (Sampling)
## 2.2014 seconds (Total)
##
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
##
## Chain 4, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 1.20294 seconds (Warm-up)
## 1.21009 seconds (Sampling)
## 2.41303 seconds (Total)
#Confidence Intervals for the predictors
posterior_interval(bayes_mod, prob = 0.95)
## 2.5% 97.5%
## (Intercept) -10.519516602 -6.768354355
## Pregnant 0.048804720 0.198337786
## Plasma_Glucose 0.024937244 0.042726827
## Dias_BP -0.027183802 0.014188527
## Triceps_Skin -0.017526337 0.016359135
## Serum_Insulin -0.002479761 0.001594718
## BMI 0.050794450 0.125171100
## DPF 0.426009682 1.802053392
## Age -0.013343434 0.033615016
#Residuals for the Bayesian Model
summary(residuals(bayes_mod))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.977900 -0.234900 -0.083110 0.000999 0.258700 0.940900
bayes_res <- data.frame(residuals(bayes_mod))
bayes_res$index <- seq.int(nrow(bayes_res))
colnames(bayes_res) <- "Residuals"
#Plotting the residuals
ggplot(data = bayes_res,aes(index,Residuals)) + geom_point() + ggtitle("Representation of randomness amongst Residuals")
ggplot(data = bayes_res,aes(Residuals)) + geom_density(aes(fill=Residuals)) +
ylab("Density") + ggtitle("Distribution of Residuals")
#Predicting Probabilities for the test data
pred <- posterior_linpred(bayes_mod, newdata = test, transform=TRUE)
fin_pred <- colMeans(pred)
test_prediction <- as.integer(fin_pred >= 0.5)
confusionMatrix(test$Diabetes,test_prediction)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 129 13
## 1 27 47
##
## Accuracy : 0.8148
## 95% CI : (0.7565, 0.8643)
## No Information Rate : 0.7222
## P-Value [Acc > NIR] : 0.00107
##
## Kappa : 0.5694
## Mcnemar's Test P-Value : 0.03983
##
## Sensitivity : 0.8269
## Specificity : 0.7833
## Pos Pred Value : 0.9085
## Neg Pred Value : 0.6351
## Prevalence : 0.7222
## Detection Rate : 0.5972
## Detection Prevalence : 0.6574
## Balanced Accuracy : 0.8051
##
## 'Positive' Class : 0
##
acc_bayes <- confusionMatrix(test$Diabetes,test_prediction)$overall['Accuracy']
plot.roc(test$Diabetes,fin_pred,percent=TRUE,col="#1c61b6", print.auc=TRUE,
main = "Area under the curve for Bayesian Logistic Regression")
##
## Call:
## plot.roc.default(x = test$Diabetes, predictor = fin_pred, percent = TRUE, col = "#1c61b6", print.auc = TRUE, main = "Area under the curve for Bayesian Logistic Regression")
##
## Data: fin_pred in 142 controls (test$Diabetes 0) < 74 cases (test$Diabetes 1).
## Area under the curve: 87.62%
#Decision Trees
set.seed(15689)
m_dt <- tree(Diabetes ~ ., data = train)
pred_dt <- predict(m_dt, train, type = "class")
confusionMatrix(train$Diabetes,pred_dt)[2:3]
## $table
## Reference
## Prediction 0 1
## 0 293 40
## 1 55 120
##
## $overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.129921e-01 5.773340e-01 7.763139e-01 8.459794e-01 6.850394e-01
## AccuracyPValue McnemarPValue
## 5.547677e-11 1.508972e-01
plot(m_dt)
text(m_dt, pretty = 0)
pred_dt_test <- predict(m_dt, test, type = "class")
confusionMatrix(test$Diabetes,pred_dt_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 108 34
## 1 30 44
##
## Accuracy : 0.7037
## 95% CI : (0.638, 0.7637)
## No Information Rate : 0.6389
## P-Value [Acc > NIR] : 0.02664
##
## Kappa : 0.3506
## Mcnemar's Test P-Value : 0.70766
##
## Sensitivity : 0.7826
## Specificity : 0.5641
## Pos Pred Value : 0.7606
## Neg Pred Value : 0.5946
## Prevalence : 0.6389
## Detection Rate : 0.5000
## Detection Prevalence : 0.6574
## Balanced Accuracy : 0.6734
##
## 'Positive' Class : 0
##
acc_dt <- confusionMatrix(pred_dt_test,test$Diabetes)$overall['Accuracy']
#Random Forest
set.seed(15689)
opt_mod <- tuneRF(train[-as.numeric(ncol(train))],train$Diabetes,ntreeTry = 150,
stepFactor = 2, improve = 0.05,trace = T, plot = T, doBest = F)
## mtry = 2 OOB error = 25.59%
## Searching left ...
## mtry = 1 OOB error = 26.57%
## -0.03846154 0.05
## Searching right ...
## mtry = 4 OOB error = 25%
## 0.02307692 0.05
mtry_fin <- opt_mod[as.numeric(which.min(opt_mod[,"OOBError"])),"mtry"]
rf_fin <- randomForest(Diabetes~.,data=train, mtry=mtry_fin, ntree=101,
keep.forest=TRUE, proximity=TRUE, importance=TRUE,test=test)
pred_test <- predict(rf_fin, newdata = test)
confusionMatrix(test$Diabetes,pred_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 117 25
## 1 23 51
##
## Accuracy : 0.7778
## 95% CI : (0.7164, 0.8314)
## No Information Rate : 0.6481
## P-Value [Acc > NIR] : 2.515e-05
##
## Kappa : 0.5098
## Mcnemar's Test P-Value : 0.8852
##
## Sensitivity : 0.8357
## Specificity : 0.6711
## Pos Pred Value : 0.8239
## Neg Pred Value : 0.6892
## Prevalence : 0.6481
## Detection Rate : 0.5417
## Detection Prevalence : 0.6574
## Balanced Accuracy : 0.7534
##
## 'Positive' Class : 0
##
acc_rf <- confusionMatrix(test$Diabetes,pred_test)$overall['Accuracy']
par(mfrow=c(1,2))
varImpPlot(rf_fin,type = 2,main = "Variable Importance",col = 'black')
plot(rf_fin,main = "Error vs no. of trees grown")
accuracy <- data.frame(Model=c("Logistic","Bayesian Logistic","Decision Tree","Random Forest"),Accuracy=c(acc_lg,acc_bayes,acc_dt,acc_rf))
ggplot(accuracy,aes(x=Model,y=Accuracy))+geom_bar(stat='identity')+theme_bw()+
ggtitle('Comparison of Model Accuracy')
From the output it is pretty evident that all the 4 models behave almost similar to each other with very little difference between the models. In order to improve accuracy we can use higher dimension classifiers such as Boosting algorithms, but for the purposes for this exercise I will not be doing it. This was just a simple workbook in order to help myself explore Bayesian Logistic Regression.