SMOTE and ML

In this exercise, we use the well-known Pima Indians diabetes dataset to demonstrate combining SMOTE with several machine learning algorithms.

# Load necessary libraries
library(smotefamily)
library(caret)
Loading required package: ggplot2
Loading required package: lattice
library(mlbench)
# Load the dataset
data(PimaIndiansDiabetes)
data <- PimaIndiansDiabetes
# Check the class distribution
table(data$diabetes)

neg pos 
500 268 
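The negatives outnumber the positives by roughly 1.9 to 1 (500 vs. 268). A quick way to see this as proportions, as a minimal check in base R:

# Class proportions: about 65% neg vs. 35% pos
round(prop.table(table(data$diabetes)), 3)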
# Step 1: Split the data into training and testing sets (80:20)
set.seed(123)  # For reproducibility
split_index <- createDataPartition(data$diabetes, p = 0.8, list = FALSE)
train_data <- data[split_index, ]
test_data <- data[-split_index, ]
# Check the class distribution in the training set
table(train_data$diabetes)

neg pos 
400 215 
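Because createDataPartition samples within each class, the split is stratified; both subsets should mirror the full data's roughly 65/35 ratio, which is easy to verify:

# Stratification check: train and test proportions should match the full data
round(prop.table(table(train_data$diabetes)), 3)
round(prop.table(table(test_data$diabetes)), 3)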
str(train_data)
'data.frame':   615 obs. of  9 variables:
 $ pregnant: num  6 8 1 5 3 10 2 8 4 10 ...
 $ glucose : num  148 183 89 116 78 115 197 125 110 168 ...
 $ pressure: num  72 64 66 74 50 0 70 96 92 74 ...
 $ triceps : num  35 0 23 0 32 0 45 0 0 0 ...
 $ insulin : num  0 0 94 0 88 0 543 0 0 0 ...
 $ mass    : num  33.6 23.3 28.1 25.6 31 35.3 30.5 0 37.6 38 ...
 $ pedigree: num  0.627 0.672 0.167 0.201 0.248 0.134 0.158 0.232 0.191 0.537 ...
 $ age     : num  50 32 21 30 26 29 53 54 30 34 ...
 $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 1 2 1 2 2 1 2 ...
str(test_data)
'data.frame':   153 obs. of  9 variables:
 $ pregnant: num  1 0 0 11 7 6 4 3 5 8 ...
 $ glucose : num  85 137 118 143 147 92 103 180 88 176 ...
 $ pressure: num  66 40 84 94 76 92 60 64 66 90 ...
 $ triceps : num  29 35 47 33 0 0 33 25 21 34 ...
 $ insulin : num  0 168 230 146 0 0 192 70 23 300 ...
 $ mass    : num  26.6 43.1 45.8 36.6 39.4 19.9 24 34 24.4 33.7 ...
 $ pedigree: num  0.351 2.288 0.551 0.254 0.257 ...
 $ age     : num  31 33 31 51 43 28 33 26 30 58 ...
 $ diabetes: Factor w/ 2 levels "neg","pos": 1 2 2 2 2 1 1 1 1 2 ...
# Step 2: Apply SMOTE only to the training set (never to the test set, to avoid leakage)
# smotefamily::SMOTE expects numeric features in X; the target is passed separately
smote_output1 <- SMOTE(X = train_data[, -which(names(train_data) == "diabetes")],  
                      target = train_data$diabetes,  
                      K = 5,  
                      dup_size = 1)  # dup_size = 1: one synthetic sample per minority instance, doubling the 'pos' class (215 -> 430)
# Convert the SMOTE output to a data frame (the class column comes back as character)
balanced_train <- data.frame(smote_output1$data)

table(balanced_train$class)

neg pos 
400 430 
balanced_train$class <- factor(balanced_train$class)  # restore the factor type
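smotefamily also returns the synthetic observations on their own in $syn_data, which is handy for sanity-checking what SMOTE actually generated; a minimal sketch:

# With dup_size = 1, expect one synthetic row per original 'pos' case (215)
nrow(smote_output1$syn_data)
head(smote_output1$syn_data)  # interpolated feature values plus the class label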
set.seed(123)  # Ensure reproducibility

mycontrol <- trainControl(
  method = "cv",  # Cross-validation
  number = 10,    # 10-fold
  classProbs = TRUE,  # Enable probability estimation
  summaryFunction = twoClassSummary  # Compute ROC, Sensitivity, Specificity
)

Logistic Regression

logistic_model <- train(
  class ~ .,  
  data = balanced_train,  
  method = "glm",  
  family = "binomial",  
  trControl = mycontrol
)
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
# Print model summary
print(logistic_model)
Generalized Linear Model 

830 samples
  8 predictor
  2 classes: 'neg', 'pos' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 747, 747, 747, 747, 747, 747, ... 
Resampling results:

  ROC        Sens   Spec     
  0.8286628  0.745  0.7534884
# Predict class labels on the held-out test set
predictions <- predict(logistic_model, newdata = test_data)
confusionMatrix(predictions, test_data$diabetes, positive = "pos")
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  81  13
       pos  19  40
                                          
               Accuracy : 0.7908          
                 95% CI : (0.7178, 0.8523)
    No Information Rate : 0.6536          
    P-Value [Acc > NIR] : 0.0001499       
                                          
                  Kappa : 0.5501          
                                          
 Mcnemar's Test P-Value : 0.3767591       
                                          
            Sensitivity : 0.7547          
            Specificity : 0.8100          
         Pos Pred Value : 0.6780          
         Neg Pred Value : 0.8617          
             Prevalence : 0.3464          
         Detection Rate : 0.2614          
   Detection Prevalence : 0.3856          
      Balanced Accuracy : 0.7824          
                                          
       'Positive' Class : pos             
                                          
pred_prob <- predict(logistic_model, newdata = test_data, type = "prob")
library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'
The following objects are masked from 'package:stats':

    cov, smooth, var
roc_curve <- roc(test_data$diabetes, pred_prob[,2])  # use the 'pos' class probability
Setting levels: control = neg, case = pos
Setting direction: controls < cases
auc(roc_curve)  # Get AUC value
Area under the curve: 0.8949
plot(roc_curve, col = "blue", main = "ROC Curve for Logistic Regression",
     legacy.axes = TRUE)

# Add AUC to the plot
auc_value <- auc(roc_curve)
legend("bottomright", legend = paste("LR: AUC =", round(auc_value, 3)), col = "blue", lwd = 3)

K-nearest neighbors

knn_model <- train(
  class ~ .,  
  data = balanced_train,  
  method = "knn",  
  trControl = mycontrol,
  tuneLength = 10
)
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
# Print model summary
print(knn_model)
k-Nearest Neighbors 

830 samples
  8 predictor
  2 classes: 'neg', 'pos' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 747, 747, 747, 747, 747, 747, ... 
Resampling results across tuning parameters:

  k   ROC        Sens    Spec     
   5  0.8124128  0.6450  0.8279070
   7  0.8066570  0.6250  0.8116279
   9  0.8093314  0.6300  0.8000000
  11  0.8039244  0.6300  0.8023256
  13  0.8037791  0.6250  0.7976744
  15  0.8027907  0.6350  0.8046512
  17  0.8016860  0.6150  0.7976744
  19  0.7999128  0.6225  0.7953488
  21  0.7949128  0.6200  0.8000000
  23  0.7947674  0.6050  0.8162791

ROC was used to select the optimal model using the largest value.
The final value used for the model was k = 5.
knn_model$bestTune
  k
1 5
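caret's plot method for train objects graphs the cross-validated ROC against each candidate k, which makes the choice of k = 5 easy to see:

# Cross-validated ROC as a function of the number of neighbors
plot(knn_model)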
# Predict class labels
knn_predictions <- predict(knn_model, newdata = test_data)
confusionMatrix(knn_predictions, test_data$diabetes,  positive = "pos")
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  68  11
       pos  32  42
                                          
               Accuracy : 0.719           
                 95% CI : (0.6407, 0.7886)
    No Information Rate : 0.6536          
    P-Value [Acc > NIR] : 0.051516        
                                          
                  Kappa : 0.4322          
                                          
 Mcnemar's Test P-Value : 0.002289        
                                          
            Sensitivity : 0.7925          
            Specificity : 0.6800          
         Pos Pred Value : 0.5676          
         Neg Pred Value : 0.8608          
             Prevalence : 0.3464          
         Detection Rate : 0.2745          
   Detection Prevalence : 0.4837          
      Balanced Accuracy : 0.7362          
                                          
       'Positive' Class : pos             
                                          
pred_prob_knn <- predict(knn_model, newdata = test_data, type = "prob")
roc_curve <- roc(test_data$diabetes, pred_prob_knn[,2])  # use the 'pos' class probability
Setting levels: control = neg, case = pos
Setting direction: controls < cases
auc(roc_curve)  # Get AUC value
Area under the curve: 0.794
plot(roc_curve, col = "blue", main = "ROC Curve for KNN",
     legacy.axes = TRUE)

# Add AUC to the plot
auc_value <- auc(roc_curve)
legend("bottomright", legend = paste("KNN: AUC =", round(auc_value, 3)), col = "blue", lwd = 3)

Random Forest

library(ranger)
set.seed(123)  # For reproducibility

# Define tuning grid
tune_grid <- expand.grid(
  mtry = c(2, 4, 6, 8),  # Number of features per split
  splitrule = c("gini", "extratrees"),  # Splitting criterion
  min.node.size = c(1, 3, 5)  # Minimum samples per terminal node
)
rf_model <- train(
  class ~ .,  
  data = balanced_train,  
  method = "ranger",  # Fast Random Forest
  trControl = mycontrol,  
  tuneGrid = tune_grid,  # Use custom hyperparameter tuning grid
  num.trees = 500,  # Number of trees
  importance = "impurity"  # Compute variable importance
)
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
# Print the best tuned parameters
print(rf_model$bestTune)
  mtry  splitrule min.node.size
4    2 extratrees             1
print(rf_model)
Random Forest 

830 samples
  8 predictor
  2 classes: 'neg', 'pos' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 747, 747, 747, 747, 747, 747, ... 
Resampling results across tuning parameters:

  mtry  splitrule   min.node.size  ROC        Sens    Spec     
  2     gini        1              0.9042151  0.7625  0.8790698
  2     gini        3              0.9033140  0.7525  0.8790698
  2     gini        5              0.8991279  0.7500  0.8604651
  2     extratrees  1              0.9177907  0.7675  0.8930233
  2     extratrees  3              0.9143605  0.7575  0.8930233
  2     extratrees  5              0.9053488  0.7650  0.8790698
  4     gini        1              0.8993023  0.7575  0.8674419
  4     gini        3              0.8949419  0.7500  0.8697674
  4     gini        5              0.8947674  0.7575  0.8604651
  4     extratrees  1              0.9145930  0.7675  0.8813953
  4     extratrees  3              0.9114244  0.7550  0.8883721
  4     extratrees  5              0.9076163  0.7725  0.8767442
  6     gini        1              0.8975872  0.7525  0.8534884
  6     gini        3              0.8941279  0.7550  0.8581395
  6     gini        5              0.8920058  0.7475  0.8558140
  6     extratrees  1              0.9136628  0.7775  0.8813953
  6     extratrees  3              0.9110465  0.7625  0.8790698
  6     extratrees  5              0.9070930  0.7725  0.8697674
  8     gini        1              0.8938372  0.7575  0.8534884
  8     gini        3              0.8919186  0.7475  0.8488372
  8     gini        5              0.8916860  0.7525  0.8488372
  8     extratrees  1              0.9099709  0.7625  0.8790698
  8     extratrees  3              0.9098547  0.7600  0.8767442
  8     extratrees  5              0.9066860  0.7825  0.8744186

ROC was used to select the optimal model using the largest value.
The final values used for the model were mtry = 2, splitrule = extratrees
 and min.node.size = 1.
plot(rf_model)

varImp(rf_model)
ranger variable importance

         Overall
glucose  100.000
age       50.334
mass      38.876
pedigree  26.692
pregnant  24.997
pressure  15.350
triceps    6.924
insulin    0.000
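The importance scores can also be visualized; caret provides a plot method for varImp objects:

# Dotplot of scaled importance scores; glucose dominates by a wide margin
plot(varImp(rf_model), top = 8)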
rf_predictions <- predict(rf_model, newdata = test_data)
rf_pred_prob <- predict(rf_model, newdata = test_data, type = "prob")
confusionMatrix(rf_predictions, test_data$diabetes, positive = "pos")
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  80  11
       pos  20  42
                                         
               Accuracy : 0.7974         
                 95% CI : (0.7249, 0.858)
    No Information Rate : 0.6536         
    P-Value [Acc > NIR] : 7.169e-05      
                                         
                  Kappa : 0.5697         
                                         
 Mcnemar's Test P-Value : 0.1508         
                                         
            Sensitivity : 0.7925         
            Specificity : 0.8000         
         Pos Pred Value : 0.6774         
         Neg Pred Value : 0.8791         
             Prevalence : 0.3464         
         Detection Rate : 0.2745         
   Detection Prevalence : 0.4052         
      Balanced Accuracy : 0.7962         
                                         
       'Positive' Class : pos            
                                         
library(pROC)

rf_roc <- roc(test_data$diabetes, rf_pred_prob[,2])  # 'pos' class probabilities
Setting levels: control = neg, case = pos
Setting direction: controls < cases
plot(rf_roc, legacy.axes = TRUE, col = "green", lwd = 2, main = "ROC Curve - Random Forest")
auc_value <- auc(rf_roc)
legend("bottomright", legend = paste("AUC =", round(auc_value, 3)), col = "green", lwd = 2)

library(caret)
library(xgboost)
library(pROC)  # For ROC curve analysis

XGBoost with default hyperparameters

set.seed(123)
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
xgb_model <- train(
  class ~ .,  
  data = balanced_train,  
  method = "xgbTree",  # XGBoost classifier
  trControl = mycontrol,  # Use the same train control
  tuneGrid = grid_default,  # single parameter combination (the xgboost defaults)
  metric = "ROC"  # Optimize for accuracy
)

# Print the best tuned parameters
print(xgb_model)
eXtreme Gradient Boosting 

830 samples
  8 predictor
  2 classes: 'neg', 'pos' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 747, 747, 747, 747, 747, 747, ... 
Resampling results:

  ROC        Sens    Spec     
  0.8878488  0.7575  0.8395349

Tuning parameter 'nrounds' was held constant at a value of 100
Tuning parameter 'max_depth' was held constant at a value of 6
Tuning parameter 'eta' was held constant at a value of 0.3
Tuning parameter 'gamma' was held constant at a value of 0
Tuning parameter 'colsample_bytree' was held constant at a value of 1
Tuning parameter 'min_child_weight' was held constant at a value of 1
Tuning parameter 'subsample' was held constant at a value of 1
varImp(xgb_model)
xgbTree variable importance

         Overall
glucose   100.00
mass       49.28
age        38.15
pedigree   27.33
pressure   14.75
pregnant   12.57
insulin     0.95
triceps     0.00
xgb_predictions <- predict(xgb_model, newdata = test_data)
xgb_pred_prob <- predict(xgb_model, newdata = test_data, type = "prob")
confusionMatrix(xgb_predictions, test_data$diabetes, mode = "everything", positive = "pos")
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  82  16
       pos  18  37
                                          
               Accuracy : 0.7778          
                 95% CI : (0.7036, 0.8409)
    No Information Rate : 0.6536          
    P-Value [Acc > NIR] : 0.000586        
                                          
                  Kappa : 0.5136          
                                          
 Mcnemar's Test P-Value : 0.863832        
                                          
            Sensitivity : 0.6981          
            Specificity : 0.8200          
         Pos Pred Value : 0.6727          
         Neg Pred Value : 0.8367          
              Precision : 0.6727          
                 Recall : 0.6981          
                     F1 : 0.6852          
             Prevalence : 0.3464          
         Detection Rate : 0.2418          
   Detection Prevalence : 0.3595          
      Balanced Accuracy : 0.7591          
                                          
       'Positive' Class : pos             
                                          
xgb_roc <- roc(test_data$diabetes, xgb_pred_prob[,2])  # 'pos' class probabilities
Setting levels: control = neg, case = pos
Setting direction: controls < cases
plot(xgb_roc, legacy.axes = TRUE, col = "green", lwd = 2, main = "ROC Curve - XGBoost")
auc_value <- auc(xgb_roc)
legend("bottomright", legend = paste("AUC =", round(auc_value, 3)), col = "green", lwd = 2)

# https://www.kaggle.com/code/pelkoja/visual-xgboost-tuning-with-caret
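Following the tuning approach in the linked notebook, a real search replaces the single-row default grid with multiple candidate values per parameter. A hedged sketch (32 combinations across 10 folds, so expect a much longer run; the values below are illustrative, not recommendations):

set.seed(123)
grid_tune <- expand.grid(
  nrounds = c(100, 200),
  max_depth = c(3, 6),
  eta = c(0.05, 0.3),
  gamma = 0,
  colsample_bytree = c(0.8, 1),
  min_child_weight = 1,
  subsample = c(0.8, 1)
)
xgb_tuned <- train(
  class ~ .,
  data = balanced_train,
  method = "xgbTree",
  trControl = mycontrol,
  tuneGrid = grid_tune,
  metric = "ROC"
)
xgb_tuned$bestTune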
library(PRROC)
# Compute the PR curve with PRROC; weights.class0 = 1 marks the 'pos' cases
pr_curve <- pr.curve(scores.class0 = xgb_pred_prob[,2],
                     weights.class0 = as.numeric(test_data$diabetes) - 1,
                     curve = TRUE)

# Plot Precision-Recall Curve
plot(pr_curve, col = "red", main = "Precision-Recall Curve - XGB")
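Unlike ROC, the precision-recall baseline is the positive-class prevalence (about 0.35 here) rather than a diagonal; adding it makes the curve easier to judge. A minimal sketch, drawn on the base-graphics plot above:

# Dashed baseline: precision of a no-skill classifier equals the prevalence
abline(h = mean(test_data$diabetes == "pos"), lty = 2, col = "gray40")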