3. Data Cleaning & Dealing with Data Types
# Convert the Outcome column to a factor (categorical variable)
diabetes[["Outcome"]] <- as.factor(diabetes[["Outcome"]])

# Compact overview of every column and its type after the conversion
glimpse(diabetes)
## Rows: 768
## Columns: 9
## $ Pregnancies <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure <int> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness <int> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin <int> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome <fct> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
5. Classification Models
5.1. Decision Tree
# Fit an rpart tree predicting Outcome from all other columns of train_db
tree <- rpart(Outcome ~ ., data = train_db)
prp(tree) # draw the fitted tree

# Predict class labels for the test data
predict_tree <- predict(tree, test_db, type = "class")

# caret expects the predictions in `data` and the true labels in `reference`.
# Accuracy and Kappa do not depend on the order, but the table orientation
# and the per-class statistics (sensitivity, specificity, PPV/NPV,
# prevalence, no-information rate) do.
# NOTE(review): caret uses the first factor level ("0") as the positive class
# by default; pass positive = "1" if diabetes should be the positive class.
cm_tree <- confusionMatrix(data = predict_tree, reference = test_db$Outcome)
print(cm_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 89 11
## 1 27 27
##
## Accuracy : 0.7532
## 95% CI : (0.6774, 0.8191)
## No Information Rate : 0.7532
## P-Value [Acc > NIR] : 0.54343
##
## Kappa : 0.4185
##
## Mcnemar's Test P-Value : 0.01496
##
## Sensitivity : 0.7672
## Specificity : 0.7105
## Pos Pred Value : 0.8900
## Neg Pred Value : 0.5000
## Prevalence : 0.7532
## Detection Rate : 0.5779
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7389
##
## 'Positive' Class : 0
##
5.2. Random Forest
# Fit a random forest on train_db: 2000 trees, 4 candidate predictors per
# split, with variable-importance measures stored on the fitted object
rf <- randomForest(
  Outcome ~ .,
  train_db,
  mtry = 4,
  ntree = 2000,
  importance = TRUE # spelled out: `T` is an ordinary, reassignable variable
)
plot(rf) # error rates as a function of the number of trees

# Predict class labels for the test data
predict_forest <- predict(rf, test_db, type = "response")

# caret expects the predictions in `data` and the true labels in `reference`.
# Accuracy and Kappa do not depend on the order, but the table orientation
# and the per-class statistics (sensitivity, specificity, PPV/NPV,
# prevalence, no-information rate) do.
cm_forest <- confusionMatrix(data = predict_forest, reference = test_db$Outcome)
print(cm_forest)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 84 16
## 1 18 36
##
## Accuracy : 0.7792
## 95% CI : (0.7054, 0.842)
## No Information Rate : 0.6623
## P-Value [Acc > NIR] : 0.001047
##
## Kappa : 0.511
##
## Mcnemar's Test P-Value : 0.863832
##
## Sensitivity : 0.8235
## Specificity : 0.6923
## Pos Pred Value : 0.8400
## Neg Pred Value : 0.6667
## Prevalence : 0.6623
## Detection Rate : 0.5455
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7579
##
## 'Positive' Class : 0
##
5.3. Support Vector Machine
# Train a linear-kernel SVM through caret on train_db, using all other
# columns as predictors; resampling and tuning are left at caret's defaults
svm_db <- train(
  Outcome ~ .,
  data = train_db,
  method = "svmLinear"
)

# Show the resampling summary of the fitted model
print(svm_db)
## Support Vector Machines with Linear Kernel
##
## 614 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 614, 614, 614, 614, 614, 614, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7726804 0.467728
##
## Tuning parameter 'C' was held constant at a value of 1
# Predict class labels for the test data with the trained SVM
predict_svm <- predict(svm_db, test_db)

# caret expects the predictions in `data` and the true labels in `reference`.
# Accuracy and Kappa do not depend on the order, but the table orientation
# and the per-class statistics (sensitivity, specificity, PPV/NPV,
# prevalence, no-information rate) do.
cm_svm <- confusionMatrix(data = predict_svm, reference = test_db$Outcome)
print(cm_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 85 15
## 1 23 31
##
## Accuracy : 0.7532
## 95% CI : (0.6774, 0.8191)
## No Information Rate : 0.7013
## P-Value [Acc > NIR] : 0.09168
##
## Kappa : 0.439
##
## Mcnemar's Test P-Value : 0.25614
##
## Sensitivity : 0.7870
## Specificity : 0.6739
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.5741
## Prevalence : 0.7013
## Detection Rate : 0.5519
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7305
##
## 'Positive' Class : 0
##
5.4. K-Nearest Neighbour
# Train a k-nearest-neighbour classifier through caret on train_db, using
# all other columns as predictors; caret chooses k by resampling
knn_db <- train(Outcome ~ ., data = train_db, method = "knn")
plot(knn_db) # resampled performance across the candidate values of k

# Predict class labels for the test data
predict_knn <- predict(knn_db, test_db)

# caret expects the predictions in `data` and the true labels in `reference`.
# Accuracy and Kappa do not depend on the order, but the table orientation
# and the per-class statistics (sensitivity, specificity, PPV/NPV,
# prevalence, no-information rate) do.
cm_knn <- confusionMatrix(data = predict_knn, reference = test_db$Outcome)
print(cm_knn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 85 15
## 1 27 27
##
## Accuracy : 0.7273
## 95% CI : (0.6497, 0.7958)
## No Information Rate : 0.7273
## P-Value [Acc > NIR] : 0.54143
##
## Kappa : 0.3689
##
## Mcnemar's Test P-Value : 0.08963
##
## Sensitivity : 0.7589
## Specificity : 0.6429
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.5000
## Prevalence : 0.7273
## Detection Rate : 0.5519
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7009
##
## 'Positive' Class : 0
##
5.5. Accuracy Table
# Pull the overall accuracy out of each model's confusion-matrix object
accuracy_tree <- cm_tree[["overall"]]["Accuracy"] # Decision Tree
accuracy_forest <- cm_forest[["overall"]]["Accuracy"] # Random Forest
accuracy_svm <- cm_svm[["overall"]]["Accuracy"] # SVM
accuracy_knn <- cm_knn[["overall"]]["Accuracy"] # KNN

# Model labels and accuracies, kept in the same order
methods <- c("Decision Tree", "Random Forest", "SVM", "KNN")
accuracies <- c(accuracy_tree, accuracy_forest, accuracy_svm, accuracy_knn)

# One row per model, accuracy as a percentage rounded to two decimals,
# sorted from best to worst
accuracy_table <- data.frame(
  Method = methods,
  Accuracy = round(100 * accuracies, 2)
)
accuracy_table <- arrange(accuracy_table, desc(Accuracy))
print(accuracy_table)
## Method Accuracy
## 1 Random Forest 77.92
## 2 Decision Tree 75.32
## 3 SVM 75.32
## 4 KNN 72.73
# Accuracy Plot
# Bar chart with one black bar per method, bar height = accuracy in percent
accuracy_plot <- ggplot(accuracy_table, aes(x = Method, y = Accuracy)) +
  geom_col(fill = "black") +
  theme_minimal()
print(accuracy_plot)
