1. Importing the Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.92 loaded
library(caTools)
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(class)
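
If any of these packages are not yet installed, they can be added first (a one-time setup step; the package names match the library() calls above):

# one-time installation of the packages used in this analysis
install.packages(c("tidyverse", "corrplot", "caTools", "rpart",
                   "rpart.plot", "caret", "randomForest", "class"))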

2. Importing the Diabetes Dataset

diabetes <- read.csv("~/Workbooks/diabetes.csv", stringsAsFactors=TRUE)
View(diabetes)
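
As a quick, optional sanity check on the import (base R only):

dim(diabetes)   # expect 768 rows and 9 columns
head(diabetes)  # first few observations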

3. Data Cleaning & Dealing with Data Types

diabetes$Outcome <- as.factor(diabetes$Outcome) # converting Outcome into a categorical variable (factor)
glimpse(diabetes)
## Rows: 768
## Columns: 9
## $ Pregnancies              <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose                  <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure            <int> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness            <int> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin                  <int> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI                      <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age                      <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome                  <fct> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
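
The glimpse above shows zeros in columns such as Glucose, BloodPressure, SkinThickness, Insulin and BMI, where a zero is not a plausible measurement and likely encodes a missing value. A minimal sketch for counting them (inspection only; no recoding is applied to the models below):

# count zero values in columns where zero is not a valid measurement
zero_check <- diabetes %>%
  summarise(across(c(Glucose, BloodPressure, SkinThickness, Insulin, BMI),
                   ~ sum(.x == 0)))
print(zero_check)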

4. Splitting the Dataset into Train and Test

set.seed(123)
split <- sample.split(diabetes$Outcome, SplitRatio = 0.8)
train_db <- subset(diabetes, split == TRUE)
test_db <- subset(diabetes, split == FALSE)
attach(diabetes)
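
sample.split() keeps the proportion of Outcome classes roughly equal in both subsets; a quick optional check:

# verify that the class balance is preserved in train and test
prop.table(table(train_db$Outcome))
prop.table(table(test_db$Outcome))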

5. Classification Models

5.1. Decision Tree

tree <- rpart(Outcome~., data=train_db)
prp(tree) # plotting the tree

predict_tree <- predict(tree, test_db, type = "class") # predicting the test data

cm_tree <- confusionMatrix(test_db$Outcome, predict_tree) # confusion matrix
print(cm_tree)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 89 11
##          1 27 27
##                                           
##                Accuracy : 0.7532          
##                  95% CI : (0.6774, 0.8191)
##     No Information Rate : 0.7532          
##     P-Value [Acc > NIR] : 0.54343         
##                                           
##                   Kappa : 0.4185          
##                                           
##  Mcnemar's Test P-Value : 0.01496         
##                                           
##             Sensitivity : 0.7672          
##             Specificity : 0.7105          
##          Pos Pred Value : 0.8900          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.7532          
##          Detection Rate : 0.5779          
##    Detection Prevalence : 0.6494          
##       Balanced Accuracy : 0.7389          
##                                           
##        'Positive' Class : 0               
## 
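
A note on caret::confusionMatrix(): its documented argument order is (predictions, reference). With the reversed order used here the table is transposed, so accuracy and kappa are unchanged, but the reported sensitivity/specificity correspond to the positive/negative predictive values of the conventional orientation. Separately, the tree's complexity-parameter table is the usual starting point for pruning; a sketch only, not applied to the results above:

# inspect the complexity-parameter table and prune at the cp with the lowest cross-validated error
printcp(tree)
tree_pruned <- prune(tree, cp = tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"])
prp(tree_pruned)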

5.2. Random Forest

rf <- randomForest(Outcome ~ ., data = train_db, mtry = 4, ntree = 2000, importance = TRUE)
plot(rf) # plotting the OOB error rate against the number of trees

predict_forest <- predict(rf, test_db, type="response") # predicting the test data

cm_forest <- confusionMatrix(test_db$Outcome, predict_forest) # confusion matrix
print(cm_forest)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 84 16
##          1 18 36
##                                          
##                Accuracy : 0.7792         
##                  95% CI : (0.7054, 0.842)
##     No Information Rate : 0.6623         
##     P-Value [Acc > NIR] : 0.001047       
##                                          
##                   Kappa : 0.511          
##                                          
##  Mcnemar's Test P-Value : 0.863832       
##                                          
##             Sensitivity : 0.8235         
##             Specificity : 0.6923         
##          Pos Pred Value : 0.8400         
##          Neg Pred Value : 0.6667         
##              Prevalence : 0.6623         
##          Detection Rate : 0.5455         
##    Detection Prevalence : 0.6494         
##       Balanced Accuracy : 0.7579         
##                                          
##        'Positive' Class : 0              
## 
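
Since the forest was grown with importance = TRUE, the variable importance measures can be inspected as a follow-up (not required for the accuracy comparison):

# variable importance: mean decrease in accuracy and in Gini impurity
varImpPlot(rf)
importance(rf)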

5.3. Support Vector Machine

svm_db <- train(Outcome~., data=train_db, method="svmLinear")
print(svm_db)
## Support Vector Machines with Linear Kernel 
## 
## 614 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 614, 614, 614, 614, 614, 614, ... 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.7726804  0.467728
## 
## Tuning parameter 'C' was held constant at a value of 1
predict_svm <- predict(svm_db, test_db) # predicting the outcomes

cm_svm <- confusionMatrix(test_db$Outcome, predict_svm) # confusion matrix
print(cm_svm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 85 15
##          1 23 31
##                                           
##                Accuracy : 0.7532          
##                  95% CI : (0.6774, 0.8191)
##     No Information Rate : 0.7013          
##     P-Value [Acc > NIR] : 0.09168         
##                                           
##                   Kappa : 0.439           
##                                           
##  Mcnemar's Test P-Value : 0.25614         
##                                           
##             Sensitivity : 0.7870          
##             Specificity : 0.6739          
##          Pos Pred Value : 0.8500          
##          Neg Pred Value : 0.5741          
##              Prevalence : 0.7013          
##          Detection Rate : 0.5519          
##    Detection Prevalence : 0.6494          
##       Balanced Accuracy : 0.7305          
##                                           
##        'Positive' Class : 0               
## 
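
By default caret resamples with 25 bootstrap repetitions and holds the cost parameter C at 1. A sketch of tuning C with 10-fold cross-validation instead (the grid values are illustrative):

# tune the cost parameter C with 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
svm_tuned <- train(Outcome ~ ., data = train_db, method = "svmLinear",
                   trControl = ctrl,
                   tuneGrid = expand.grid(C = c(0.01, 0.1, 1, 10)))
print(svm_tuned)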

5.4. K-Nearest Neighbour

knn_db <- train(Outcome ~ ., data=train_db, method="knn")
plot(knn_db) # plotting resampled accuracy against k (number of neighbours)

predict_knn <- predict(knn_db, test_db) # predicting the outcomes

cm_knn <- confusionMatrix(test_db$Outcome, predict_knn) # confusion matrix
print(cm_knn) 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 85 15
##          1 27 27
##                                           
##                Accuracy : 0.7273          
##                  95% CI : (0.6497, 0.7958)
##     No Information Rate : 0.7273          
##     P-Value [Acc > NIR] : 0.54143         
##                                           
##                   Kappa : 0.3689          
##                                           
##  Mcnemar's Test P-Value : 0.08963         
##                                           
##             Sensitivity : 0.7589          
##             Specificity : 0.6429          
##          Pos Pred Value : 0.8500          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.7273          
##          Detection Rate : 0.5519          
##    Detection Prevalence : 0.6494          
##       Balanced Accuracy : 0.7009          
##                                           
##        'Positive' Class : 0               
## 
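
KNN is distance-based, so predictors on larger scales (e.g. Insulin) can dominate the distance calculation. A variant that centres and scales the predictors inside the resampling loop and searches more values of k (illustrative settings):

# centre and scale predictors before computing distances; widen the k search
knn_scaled <- train(Outcome ~ ., data = train_db, method = "knn",
                    preProcess = c("center", "scale"),
                    tuneLength = 10)
plot(knn_scaled)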

5.5. Accuracy Table

# Saving accuracies in variables
accuracy_tree <- cm_tree$overall['Accuracy'] # Decision Tree
accuracy_forest <- cm_forest$overall['Accuracy'] # Random Forest
accuracy_svm <- cm_svm$overall['Accuracy'] # SVM
accuracy_knn <- cm_knn$overall['Accuracy'] # KNN

methods <- c("Decision Tree", "Random Forest", "SVM", "KNN")
accuracies <- c(accuracy_tree, accuracy_forest, accuracy_svm, accuracy_knn)
accuracy_table <- data.frame(Method = methods, Accuracy = round(accuracies*100,2))
accuracy_table <- accuracy_table %>% arrange(desc(Accuracy))
print(accuracy_table)
##          Method Accuracy
## 1 Random Forest    77.92
## 2 Decision Tree    75.32
## 3           SVM    75.32
## 4           KNN    72.73
# Accuracy Plot
accuracy_plot <- accuracy_table %>% 
  ggplot(aes(Method, Accuracy)) +
  geom_bar(stat = "identity", fill="black") + 
  theme_minimal()
print(accuracy_plot)
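
Note that ggplot2 orders a character Method axis alphabetically, so the bars above do not follow the sorted table even after arrange(). Reordering the factor by Accuracy keeps plot and table consistent (geom_col() is equivalent to geom_bar(stat = "identity")):

# order the bars by accuracy so the plot matches the sorted table
accuracy_plot_sorted <- accuracy_table %>%
  ggplot(aes(reorder(Method, -Accuracy), Accuracy)) +
  geom_col(fill = "black") +
  labs(x = "Method") +
  theme_minimal()
print(accuracy_plot_sorted)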