diabetes = read.csv("./diabetes_012_health_indicators_BRFSS2015.csv/diabetes_012_health_indicators_BRFSS2015.csv")
str(diabetes)
## 'data.frame': 253680 obs. of 22 variables:
## $ Diabetes_012 : num 0 0 0 0 0 0 0 0 2 0 ...
## $ HighBP : num 1 0 1 1 1 1 1 1 1 0 ...
## $ HighChol : num 1 0 1 0 1 1 0 1 1 0 ...
## $ CholCheck : num 1 0 1 1 1 1 1 1 1 1 ...
## $ BMI : num 40 25 28 27 24 25 30 25 30 24 ...
## $ Smoker : num 1 1 0 0 0 1 1 1 1 0 ...
## $ Stroke : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HeartDiseaseorAttack: num 0 0 0 0 0 0 0 0 1 0 ...
## $ PhysActivity : num 0 1 0 1 1 1 0 1 0 0 ...
## $ Fruits : num 0 0 1 1 1 1 0 0 1 0 ...
## $ Veggies : num 1 0 0 1 1 1 0 1 1 1 ...
## $ HvyAlcoholConsump : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AnyHealthcare : num 1 0 1 1 1 1 1 1 1 1 ...
## $ NoDocbcCost : num 0 1 1 0 0 0 0 0 0 0 ...
## $ GenHlth : num 5 3 5 2 2 2 3 3 5 2 ...
## $ MentHlth : num 18 0 30 0 3 0 0 0 30 0 ...
## $ PhysHlth : num 15 0 30 0 0 2 14 0 30 0 ...
## $ DiffWalk : num 1 0 1 0 0 0 0 1 1 0 ...
## $ Sex : num 0 0 0 0 0 1 0 0 0 1 ...
## $ Age : num 9 7 9 11 11 10 9 11 9 8 ...
## $ Education : num 4 6 4 3 5 6 6 4 5 4 ...
## $ Income : num 3 1 8 6 4 8 7 4 1 3 ...
summary(diabetes)
## Diabetes_012 HighBP HighChol CholCheck
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.000 Median :0.0000 Median :1.0000
## Mean :0.2969 Mean :0.429 Mean :0.4241 Mean :0.9627
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :2.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## BMI Smoker Stroke HeartDiseaseorAttack
## Min. :12.00 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:24.00 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :27.00 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :28.38 Mean :0.4432 Mean :0.04057 Mean :0.09419
## 3rd Qu.:31.00 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :98.00 Max. :1.0000 Max. :1.00000 Max. :1.00000
## PhysActivity Fruits Veggies HvyAlcoholConsump
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.7565 Mean :0.6343 Mean :0.8114 Mean :0.0562
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## AnyHealthcare NoDocbcCost GenHlth MentHlth
## Min. :0.0000 Min. :0.00000 Min. :1.000 Min. : 0.000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.: 0.000
## Median :1.0000 Median :0.00000 Median :2.000 Median : 0.000
## Mean :0.9511 Mean :0.08418 Mean :2.511 Mean : 3.185
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:3.000 3rd Qu.: 2.000
## Max. :1.0000 Max. :1.00000 Max. :5.000 Max. :30.000
## PhysHlth DiffWalk Sex Age
## Min. : 0.000 Min. :0.0000 Min. :0.0000 Min. : 1.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 6.000
## Median : 0.000 Median :0.0000 Median :0.0000 Median : 8.000
## Mean : 4.242 Mean :0.1682 Mean :0.4403 Mean : 8.032
## 3rd Qu.: 3.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:10.000
## Max. :30.000 Max. :1.0000 Max. :1.0000 Max. :13.000
## Education Income
## Min. :1.00 Min. :1.000
## 1st Qu.:4.00 1st Qu.:5.000
## Median :5.00 Median :7.000
## Mean :5.05 Mean :6.054
## 3rd Qu.:6.00 3rd Qu.:8.000
## Max. :6.00 Max. :8.000
We can observe that the target variable Diabetes_012 has 3 classes. For binary classification, the pre-diabetic group has to be merged into either the non-diabetic or the diabetic group. I add it to the diabetic group, since being told by a doctor that one is pre-diabetic can be a good indicator of the risk of becoming diabetic in the future.
diabetes_clean = diabetes
diabetes_clean$Diabetes_012 = as.factor(ifelse(diabetes_clean$Diabetes_012 == 0, 0, 1))
diabetes_clean = diabetes_clean %>% rename(isDiabetic = Diabetes_012)
All the variables except BMI, MentHlth and PhysHlth are categorical in nature, mostly answering a question with 1 = Yes, 0 = No. These three are kept numerical and the rest of the variables are converted to factors.
diabetes_clean = diabetes_clean %>% mutate(across(!c("BMI","MentHlth","PhysHlth"), as.factor))
sum(is.na(diabetes))
## [1] 0
There are no missing values in the data.
sum(duplicated(diabetes_clean))
## [1] 23968
There are 23968 duplicate rows in the data, which are removed with the distinct() function in the following step.
diabetes_clean = diabetes_clean %>% distinct()
Exploratory data analysis is performed next to examine the distribution of the variables and their effect on the response variable.
Distribution of diabetes
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "Distrubution of diabetes", x= "", y= "", fill="is Diabetic")The data is imbalanced with 17% of the respondents having diabetics. Measures to be taken to balance the dataset. Techniques like Sampling methods, SMOTE, ROSE package over sampling or under sampling can be used.
High BP vs Diabetes
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(HighBP == 1, "High", "Normal"))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "High BP vs Diabetes", x= "", y= "", fill="Blood Pressure")We can observe that High BP is most prevalent among people with diabetes.
High Cholesterol
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(HighChol == 1, "High", "Normal"))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "High Cholesterol vs Diabetes", x= "", y= "", fill="High Cholesterol")More than 60% of the people who are diabetic have high cholesterol. There seems to be an association with prevalence of high Cholesterol and Diabetes.
Smoking vs diabetes
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(Smoker == 1, "Smoker", "Non-Smoker"))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "Smoking habits vs Diabetes", x= "", y= "", fill="Smoker")The distribution of smoking habits seems to almost equal among people with or with out diabetes.
Ever told you had a stroke in the past
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(Stroke == 1, "Yes", "No"))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "Ever had a stroke", x= "", y= "", fill="Had Stroke in the past")Having a stroke in the past doesn’t seem to be indicator for the risk of being diabetic, as observed from the graph, less than 10% of the people who had stroke in the past where diabetic.
Alcohol Consumption
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(HvyAlcoholConsump == 1, "Heavy", "Normal"))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "Alcohol Consumption vs Diabetes", x= "", y= "", fill="Alcohol Consumption")Very few people who consume high amount of alcohol have diabetes.
Prevalence of diabetes by gender
diabetes_clean %>%
ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic", "Non Diabetic" ),
fill = ifelse(Sex == 0, "Female", "Male"))) +
geom_bar(stat="count", alpha = 0.8) +
stat_count(geom = "text", colour = "black", size = 3.5,
aes(label = paste("n = ", ..count..)),
position=position_stack(vjust=0.5)) +
labs(title = "Gender vs Diabetes", x= "", y= "", fill="Gender")Diabetes seem to be more prevalent among women than in men. But if the difference is significant can be further investigated.
BMI vs Diabetes
diabetes_clean %>% ggplot(aes(x = ifelse(isDiabetic == 1, "Diabetic","Non-Diabetic"), y = BMI, fill = ifelse(Sex == 0, "Female", "Male"))) +
geom_boxplot() +
labs(title = "BMI vs diabetes", x = "is diabetic", y ="BMI", fill = "gender") From the plot above, both females and males who are diabetic have higher BMI than those who are not-diabetic
Comment : From the exploratory analysis above, HighChol, HighBP and BMI appear to be useful predictors. I will fit a random forest model on the data, pick the important variables from the variable-importance plot, and investigate them further.
chisq.test(diabetes_clean$Age, diabetes_clean$isDiabetic)
##
## Pearson's Chi-squared test
##
## data: diabetes_clean$Age and diabetes_clean$isDiabetic
## X-squared = 8844.9, df = 12, p-value < 2.2e-16
Comment : Age is a categorical variable, where the age of the respondents is grouped into categories as described in the variable documentation. The chi-squared test shows that there is an association between age and being diabetic.
chisq.test(diabetes_clean$Income, diabetes_clean$isDiabetic)
##
## Pearson's Chi-squared test
##
## data: diabetes_clean$Income and diabetes_clean$isDiabetic
## X-squared = 5153, df = 7, p-value < 2.2e-16
Comment : There is a significant association between income and diabetes, as indicated by the very small p-value.
chisq.test(diabetes_clean$Education, diabetes_clean$isDiabetic)
##
## Pearson's Chi-squared test
##
## data: diabetes_clean$Education and diabetes_clean$isDiabetic
## X-squared = 2856.1, df = 5, p-value < 2.2e-16
Comment : Education level and diabetes are associated with each other.
chisq.test(diabetes_clean$GenHlth, diabetes_clean$isDiabetic)
##
## Pearson's Chi-squared test
##
## data: diabetes_clean$GenHlth and diabetes_clean$isDiabetic
## X-squared = 18940, df = 4, p-value < 2.2e-16
Comment : General health and diabetes are associated with each other, as per the results above.
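With roughly 230,000 rows, even weak associations yield p-values below 2.2e-16, so an effect-size measure helps to compare the predictors. A minimal sketch of Cramér's V computed from the chi-squared statistic; the helper function cramers_v is illustrative and not part of the original analysis:
# Hedged sketch: Cramer's V as an effect size for the chi-squared associations above
cramers_v <- function(x, y) {
  tab <- table(x, y)
  chi2 <- suppressWarnings(chisq.test(tab)$statistic)
  k <- min(nrow(tab), ncol(tab)) - 1
  as.numeric(sqrt(chi2 / (sum(tab) * k)))
}
cramers_v(diabetes_clean$GenHlth, diabetes_clean$isDiabetic)
cramers_v(diabetes_clean$Age, diabetes_clean$isDiabetic)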
Balancing the whole dataset
diabetes_bal <- ROSE(isDiabetic ~ ., data = diabetes_clean, seed = 1)$data
table(diabetes_bal$isDiabetic)
##
## 0 1
## 114642 115070
Smaller sample
set.seed(123)
index.small = sample(1:nrow(diabetes_bal), size =20000)
small.df = diabetes_bal[index.small,]
nrow(small.df)
## [1] 20000
Train-test split
set.seed(123)
index = sample(1:nrow(small.df), size = 0.7*nrow(small.df))
train.df = small.df[index,]
test.df = small.df[-index,]
set.seed(123)
rf_model0 = randomForest(isDiabetic ~ ., data = train.df, importance = TRUE, ntree = 1000)
rf_model0
##
## Call:
## randomForest(formula = isDiabetic ~ ., data = train.df, importance = TRUE, ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 24%
## Confusion matrix:
## 0 1 class.error
## 0 4854 2103 0.3022855
## 1 1257 5786 0.1784751
varImpPlot(rf_model0)
From the variable-importance plot above, we observe that BMI, PhysHlth, MentHlth, Age, GenHlth, Income, HighBP, Education and HighChol are more important than the other variables. I consider these variables for developing the subsequent models.
Note: After the extensive EDA, I pick BMI, PhysHlth, MentHlth, Age, GenHlth, Income, HighBP, Education and HighChol as the predictors for the models.
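The importance scores behind the plot can also be inspected as a table; a minimal sketch using randomForest::importance() on the model fitted above:
# Hedged sketch: predictors ranked by mean decrease in accuracy
imp <- randomForest::importance(rf_model0)
imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ]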
set.seed(123)
rf_model02 = randomForest(isDiabetic ~ BMI + PhysHlth + Age + MentHlth + GenHlth + Income + HighBP + Education + HighChol, data = train.df,ntree = 1000)
rf_model02
##
## Call:
## randomForest(formula = isDiabetic ~ BMI + PhysHlth + Age + MentHlth + GenHlth + Income + HighBP + Education + HighChol, data = train.df, ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 25.19%
## Confusion matrix:
## 0 1 class.error
## 0 4845 2112 0.3035791
## 1 1414 5629 0.2007667
rf_pred <- predict(rf_model02, newdata = test.df) # predict class
caret::confusionMatrix(as.factor(rf_pred), as.factor(test.df$isDiabetic), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2047 617
## 1 1000 2336
##
## Accuracy : 0.7305
## 95% CI : (0.7191, 0.7417)
## No Information Rate : 0.5078
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4619
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7911
## Specificity : 0.6718
## Pos Pred Value : 0.7002
## Neg Pred Value : 0.7684
## Prevalence : 0.4922
## Detection Rate : 0.3893
## Detection Prevalence : 0.5560
## Balanced Accuracy : 0.7314
##
## 'Positive' Class : 1
##
Comment : The accuracy of the random forest model with the predictors selected from EDA is 73.05%, with a sensitivity of 79.11% and a specificity of 67.18%.
mtry.values = seq(2,6,1)
nodesize.values = seq(2,8,2)
ntree.values = seq(1e3,6e3,1e3)
hyper_grid = expand.grid(mtry = mtry.values, nodesize = nodesize.values,
ntree = ntree.values)
# hyper_grid: data frame with all hyperparameter combinations
oob_err = c()
for (i in 1:nrow(hyper_grid)) {
set.seed(123)
# Train a Random Forest model
model = randomForest(isDiabetic ~
BMI +
PhysHlth +
Age +
MentHlth +
GenHlth +
Income +
HighBP +
Education +
HighChol,
data = train.df,
mtry = hyper_grid$mtry[i],
nodesize = hyper_grid$nodesize[i],
ntree = hyper_grid$ntree[i])
# Store the final OOB error of the fitted model (last row, "OOB" column of err.rate)
oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"]
}
# Identify the optimal set of hyperparameters based on OOB error
opt_i = which.min(oob_err)
print(hyper_grid[opt_i,])
## mtry nodesize ntree
## 52 3 6 3000
# Build a random forest model using the best hyperparameters
set.seed(123)
rf_model03 = randomForest(isDiabetic ~ BMI +
PhysHlth +
Age +
MentHlth +
GenHlth +
Income +
HighBP +
Education +
HighChol,
data = train.df,
mtry = 3,
nodesize = 6,
ntree = 3000)
rf_pred_tuned <- predict(rf_model03, newdata = test.df) # predict class
caret::confusionMatrix(as.factor(rf_pred_tuned), as.factor(test.df$isDiabetic), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2007 565
## 1 1040 2388
##
## Accuracy : 0.7325
## 95% CI : (0.7211, 0.7437)
## No Information Rate : 0.5078
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4662
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8087
## Specificity : 0.6587
## Pos Pred Value : 0.6966
## Neg Pred Value : 0.7803
## Prevalence : 0.4922
## Detection Rate : 0.3980
## Detection Prevalence : 0.5713
## Balanced Accuracy : 0.7337
##
## 'Positive' Class : 1
##
Comment : After hyperparameter tuning, the optimal values are mtry = 3, nodesize = 6 and ntree = 3000. The accuracy increases slightly to 73.25%, and the sensitivity improves to 80.87%, which is desirable for this project because we want to flag as many people at risk of diabetes as possible.
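Beyond accuracy at a single cutoff, the untuned and tuned forests can also be compared on test-set AUC; a minimal sketch using class probabilities and the ROCR package (the same package used for the cutoff analysis further below); the object names are illustrative:
# Hedged sketch: compare untuned and tuned random forests on test-set AUC
library(ROCR)
rf_prob_base  <- predict(rf_model02, newdata = test.df, type = "prob")[, "1"]
rf_prob_tuned <- predict(rf_model03, newdata = test.df, type = "prob")[, "1"]
auc_base  <- performance(prediction(rf_prob_base,  test.df$isDiabetic), "auc")@y.values[[1]]
auc_tuned <- performance(prediction(rf_prob_tuned, test.df$isDiabetic), "auc")@y.values[[1]]
c(base = auc_base, tuned = auc_tuned)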
dt_model0 = rpart(isDiabetic ~
BMI +
PhysHlth +
Age +
MentHlth +
GenHlth +
Income +
HighBP +
Education + HighChol , data = train.df)
rattle::fancyRpartPlot(dt_model0, sub = "")
dt_pred = predict(dt_model0, test.df, type = "class")
caret::confusionMatrix(as.factor(dt_pred), as.factor(test.df$isDiabetic), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1882 616
## 1 1165 2337
##
## Accuracy : 0.7032
## 95% CI : (0.6914, 0.7147)
## No Information Rate : 0.5078
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4079
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7914
## Specificity : 0.6177
## Pos Pred Value : 0.6673
## Neg Pred Value : 0.7534
## Prevalence : 0.4922
## Detection Rate : 0.3895
## Detection Prevalence : 0.5837
## Balanced Accuracy : 0.7045
##
## 'Positive' Class : 1
##
Comment : The decision tree uses HighBP, GenHlth, Age and BMI as splitting variables. The accuracy of the model is 70.32%, the sensitivity is 79.14% and the specificity is around 61.77%, which is the lowest of all the models.
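Which variables the tree actually splits on can be confirmed directly from the fitted rpart object; a minimal sketch:
# Hedged sketch: variable importance stored in the rpart fit, and the CP table,
# which also lists the variables actually used in tree construction
dt_model0$variable.importance
printcp(dt_model0)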
glm_model0 = glm(isDiabetic ~
BMI +
PhysHlth +
Age +
MentHlth +
GenHlth +
Income +
HighBP +
Education + HighChol , data = train.df, family = binomial)
summary(glm_model0)
##
## Call:
## glm(formula = isDiabetic ~ BMI + PhysHlth + Age + MentHlth +
## GenHlth + Income + HighBP + Education + HighChol, family = binomial,
## data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9254 -0.8663 0.2781 0.8701 2.7963
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.906400 0.653696 -9.035 < 2e-16 ***
## BMI 0.072708 0.003297 22.052 < 2e-16 ***
## PhysHlth -0.003314 0.002491 -1.330 0.1835
## Age2 0.598742 0.351572 1.703 0.0886 .
## Age3 0.661805 0.335266 1.974 0.0484 *
## Age4 1.298013 0.314863 4.122 3.75e-05 ***
## Age5 1.453834 0.308025 4.720 2.36e-06 ***
## Age6 1.615794 0.304749 5.302 1.15e-07 ***
## Age7 2.057432 0.301073 6.834 8.28e-12 ***
## Age8 2.100473 0.299568 7.012 2.35e-12 ***
## Age9 2.296418 0.299288 7.673 1.68e-14 ***
## Age10 2.495972 0.299314 8.339 < 2e-16 ***
## Age11 2.563069 0.300694 8.524 < 2e-16 ***
## Age12 2.574421 0.303185 8.491 < 2e-16 ***
## Age13 2.302764 0.302798 7.605 2.85e-14 ***
## MentHlth -0.003373 0.002646 -1.275 0.2024
## GenHlth2 0.620688 0.084919 7.309 2.69e-13 ***
## GenHlth3 1.275480 0.083858 15.210 < 2e-16 ***
## GenHlth4 1.715047 0.094705 18.109 < 2e-16 ***
## GenHlth5 1.999090 0.123734 16.156 < 2e-16 ***
## Income2 0.224025 0.117535 1.906 0.0566 .
## Income3 0.155125 0.113311 1.369 0.1710
## Income4 0.067414 0.109685 0.615 0.5388
## Income5 0.109467 0.108122 1.012 0.3113
## Income6 -0.082626 0.105195 -0.785 0.4322
## Income7 -0.144668 0.104859 -1.380 0.1677
## Income8 -0.123481 0.102857 -1.201 0.2299
## HighBP1 0.730369 0.043059 16.962 < 2e-16 ***
## Education2 0.330132 0.579585 0.570 0.5689
## Education3 -0.080823 0.569997 -0.142 0.8872
## Education4 -0.108121 0.564787 -0.191 0.8482
## Education5 -0.154203 0.565009 -0.273 0.7849
## Education6 -0.219631 0.565383 -0.388 0.6977
## HighChol1 0.541085 0.041353 13.084 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19408 on 13999 degrees of freedom
## Residual deviance: 14910 on 13966 degrees of freedom
## AIC: 14978
##
## Number of Fisher Scoring iterations: 5
vif(glm_model0)
## GVIF Df GVIF^(1/(2*Df))
## BMI 1.103019 1 1.050247
## PhysHlth 1.617243 1 1.271709
## Age 1.272310 12 1.010085
## MentHlth 1.253706 1 1.119690
## GenHlth 1.761747 4 1.073354
## Income 1.484429 7 1.028618
## HighBP 1.109936 1 1.053535
## Education 1.314403 5 1.027715
## HighChol 1.058708 1 1.028935
The VIF values show that there is no multicollinearity among the predictors.
# Compute optimal cutoff
PredProb = prediction(predict.glm(glm_model0, newdata = test.df, type = "response"), test.df$isDiabetic)
# Computing threshold for cutoff to best trade off sensitivity and specificity
plot(unlist(performance(PredProb,'sens')@x.values),
unlist(performance(PredProb,'sens')@y.values),
type='l', lwd=2, ylab = "", xlab = 'Cutoff')
mtext('Sensitivity',side=2)
mtext('Sensitivity vs. Specificity Plot', side=3)
# Second specificity in same plot
par(new=TRUE)
plot(unlist(performance(PredProb,'spec')@x.values),
unlist(performance(PredProb,'spec')@y.values),
type='l', lwd=2,col='red', ylab = "", xlab = 'Cutoff')
axis(4,at=seq(0,1,0.2))
mtext('Specificity',side=4, col='red')
par(new=TRUE)
min.diff <-which.min(abs(unlist(performance(PredProb, "sens")@y.values) - unlist(performance(PredProb, "spec")@y.values)))
min.x<-unlist(performance(PredProb, "sens")@x.values)[min.diff]
min.y<-unlist(performance(PredProb, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 3)
# Predict classes and convert probabilities to binary
glm_prob = predict(glm_model0, newdata = test.df, type = "response")
glm_pred <- ifelse(glm_prob >= optimal, 1, 0)
caret::confusionMatrix(as.factor(glm_pred), as.factor(test.df$isDiabetic), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2182 838
## 1 865 2115
##
## Accuracy : 0.7162
## 95% CI : (0.7046, 0.7276)
## No Information Rate : 0.5078
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4323
##
## Mcnemar's Test P-Value : 0.5287
##
## Sensitivity : 0.7162
## Specificity : 0.7161
## Pos Pred Value : 0.7097
## Neg Pred Value : 0.7225
## Prevalence : 0.4922
## Detection Rate : 0.3525
## Detection Prevalence : 0.4967
## Balanced Accuracy : 0.7162
##
## 'Positive' Class : 1
##
Comment : As the dataset was already balanced with ROSE, the optimal cutoff is around 0.5, and using this cutoff gives similar values, around 71%, for all the metrics.
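The ROCR prediction object built above can also report the logistic model's test-set AUC; a minimal sketch:
# Hedged sketch: area under the ROC curve for the logistic regression on the test set
performance(PredProb, "auc")@y.values[[1]]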
nb_model0 = naive_bayes(isDiabetic ~ BMI +
PhysHlth +
Age +
MentHlth +
GenHlth +
Income +
HighBP +
Education + HighChol, data = train.df, usekernel = TRUE)
summary(nb_model0)
##
## ================================== Naive Bayes ==================================
##
## - Call: naive_bayes.formula(formula = isDiabetic ~ BMI + PhysHlth + Age + MentHlth + GenHlth + Income + HighBP + Education + HighChol, data = train.df, usekernel = TRUE)
## - Laplace: 0
## - Classes: 2
## - Samples: 14000
## - Features: 9
## - Conditional distributions:
## - Bernoulli: 2
## - Categorical: 4
## - KDE: 3
## - Prior probabilities:
## - 0: 0.4969
## - 1: 0.5031
##
## ---------------------------------------------------------------------------------
par(mfrow = c(3,3))
plot(nb_model0)
nb_prob = predict(nb_model0, test.df)
caret::confusionMatrix(as.factor(nb_prob), test.df$isDiabetic, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2202 734
## 1 845 2219
##
## Accuracy : 0.7368
## 95% CI : (0.7255, 0.7479)
## No Information Rate : 0.5078
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4738
##
## Mcnemar's Test P-Value : 0.005636
##
## Sensitivity : 0.7514
## Specificity : 0.7227
## Pos Pred Value : 0.7242
## Neg Pred Value : 0.7500
## Prevalence : 0.4922
## Detection Rate : 0.3698
## Detection Prevalence : 0.5107
## Balanced Accuracy : 0.7371
##
## 'Positive' Class : 1
##
knn.train.df = train.df
knn.test.df = test.df
knn.train.df$isDiabetic = as.factor(ifelse(knn.train.df$isDiabetic == 0, "No", "Yes"))
knn.test.df$isDiabetic = as.factor(ifelse(knn.test.df$isDiabetic == 0, "No", "Yes"))
trControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
classProbs = TRUE,
summaryFunction = twoClassSummary)
set.seed(222)
fit <- train(as.factor(isDiabetic) ~ BMI +
PhysHlth +
Age +
MentHlth +
GenHlth +
Income +
HighBP +
Education + HighChol,
data = knn.train.df,
method = 'knn',
tuneLength = 20,
trControl = trControl,
preProc = c("center", "scale"),
tuneGrid = expand.grid(k = 1:40))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
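The warning appears because twoClassSummary() reports ROC, sensitivity and specificity rather than accuracy; it can be avoided by passing metric = "ROC" to train(). A minimal sketch (the object name fit_roc is illustrative, and the call is otherwise the same as above):
# Hedged sketch: state the optimisation metric explicitly so caret does not warn
set.seed(222)
fit_roc <- train(isDiabetic ~ BMI + PhysHlth + Age + MentHlth + GenHlth +
                   Income + HighBP + Education + HighChol,
                 data = knn.train.df,
                 method = "knn",
                 metric = "ROC",
                 trControl = trControl,
                 preProc = c("center", "scale"),
                 tuneGrid = expand.grid(k = 1:40))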
fit
## k-Nearest Neighbors
##
## 14000 samples
## 9 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (33), scaled (33)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 12599, 12600, 12601, 12600, 12600, 12600, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.6630655 0.6621623 0.6639687
## 2 0.7038774 0.6484117 0.6480167
## 3 0.7233019 0.6715068 0.6886712
## 4 0.7345224 0.6632161 0.6859752
## 5 0.7430578 0.6721773 0.7094965
## 6 0.7474626 0.6703108 0.7104901
## 7 0.7512544 0.6738065 0.7188679
## 8 0.7549653 0.6698786 0.7179222
## 9 0.7573892 0.6715537 0.7279558
## 10 0.7593258 0.6678166 0.7276235
## 11 0.7607588 0.6705474 0.7332556
## 12 0.7620187 0.6672406 0.7303209
## 13 0.7630385 0.6695422 0.7367117
## 14 0.7642851 0.6684863 0.7360020
## 15 0.7657793 0.6708355 0.7369483
## 16 0.7672934 0.6725602 0.7377526
## 17 0.7678895 0.6701170 0.7380385
## 18 0.7685013 0.6709301 0.7391271
## 19 0.7687713 0.6712660 0.7384648
## 20 0.7688340 0.6710739 0.7376132
## 21 0.7689050 0.6690619 0.7383697
## 22 0.7696907 0.6708836 0.7387951
## 23 0.7700928 0.6708825 0.7398838
## 24 0.7707228 0.6712662 0.7411618
## 25 0.7709329 0.6703556 0.7426771
## 26 0.7714022 0.6696840 0.7409265
## 27 0.7718694 0.6701636 0.7413519
## 28 0.7719155 0.6693012 0.7400258
## 29 0.7725613 0.6707384 0.7414943
## 30 0.7726863 0.6694930 0.7402155
## 31 0.7724386 0.6696844 0.7402160
## 32 0.7727348 0.6692057 0.7383231
## 33 0.7729177 0.6710740 0.7379917
## 34 0.7730669 0.6703542 0.7387491
## 35 0.7732995 0.6707376 0.7372354
## 36 0.7734806 0.6704027 0.7366193
## 37 0.7736052 0.6719350 0.7343005
## 38 0.7734961 0.6716489 0.7343009
## 39 0.7734656 0.6727499 0.7334002
## 40 0.7732068 0.6727981 0.7318385
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 37.
plot(fit)
knn_preds = predict(fit, newdata = knn.test.df)
caret::confusionMatrix(as.factor(knn_preds), knn.test.df$isDiabetic, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1958 784
## Yes 1089 2169
##
## Accuracy : 0.6878
## 95% CI : (0.6759, 0.6995)
## No Information Rate : 0.5078
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3765
##
## Mcnemar's Test P-Value : 2.151e-12
##
## Sensitivity : 0.7345
## Specificity : 0.6426
## Pos Pred Value : 0.6657
## Neg Pred Value : 0.7141
## Prevalence : 0.4922
## Detection Rate : 0.3615
## Detection Prevalence : 0.5430
## Balanced Accuracy : 0.6886
##
## 'Positive' Class : Yes
##