# Load necessary libraries
library(smotefamily)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(mlbench)

# In this exercise, we use the famous Pima Indians Diabetes dataset to
# demonstrate using SMOTE and ML algorithms.

# Load the dataset
data("PimaIndiansDiabetes")
data <- PimaIndiansDiabetes

# Outcome counts in the full dataset ("neg" is the majority class)
table(data$diabetes)
#> neg pos
#> 500 268
# Step 1: Split the data into training and testing sets (80:20).
# createDataPartition() samples within each outcome class, so both sets keep
# roughly the original neg/pos ratio.
set.seed(123) # For reproducibility
split_index <- createDataPartition(y = data$diabetes, p = 0.8, list = FALSE)
train_data <- data[split_index, ]
test_data <- data[-split_index, ]

# Outcome counts in the training set
table(train_data$diabetes)
#> neg pos
#> 400 215
# Structure of the training partition: 8 numeric predictors + factor outcome
str(train_data)
#> 'data.frame': 615 obs. of 9 variables:
#> $ pregnant: num 6 8 1 5 3 10 2 8 4 10 ...
#> $ glucose : num 148 183 89 116 78 115 197 125 110 168 ...
#> $ pressure: num 72 64 66 74 50 0 70 96 92 74 ...
#> $ triceps : num 35 0 23 0 32 0 45 0 0 0 ...
#> $ insulin : num 0 0 94 0 88 0 543 0 0 0 ...
#> $ mass : num 33.6 23.3 28.1 25.6 31 35.3 30.5 0 37.6 38 ...
#> $ pedigree: num 0.627 0.672 0.167 0.201 0.248 0.134 0.158 0.232 0.191 0.537 ...
#> $ age : num 50 32 21 30 26 29 53 54 30 34 ...
#> $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 1 2 1 2 2 1 2 ...

# Structure of the held-out test partition
str(test_data)
#> 'data.frame': 153 obs. of 9 variables:
#> $ pregnant: num 1 0 0 11 7 6 4 3 5 8 ...
#> $ glucose : num 85 137 118 143 147 92 103 180 88 176 ...
#> $ pressure: num 66 40 84 94 76 92 60 64 66 90 ...
#> $ triceps : num 29 35 47 33 0 0 33 25 21 34 ...
#> $ insulin : num 0 168 230 146 0 0 192 70 23 300 ...
#> $ mass : num 26.6 43.1 45.8 36.6 39.4 19.9 24 34 24.4 33.7 ...
#> $ pedigree: num 0.351 2.288 0.551 0.254 0.257 ...
#> $ age : num 31 33 31 51 43 28 33 26 30 58 ...
#> $ diabetes: Factor w/ 2 levels "neg","pos": 1 2 2 2 2 1 1 1 1 2 ...
# Step 2: Apply SMOTE only to the training set, so the test set contains
# real observations only.
# X receives the predictor columns and target receives the class labels; the
# "neg"/"pos" factor is passed as-is (no 0/1 recoding is done here).
# Predictors are selected by name: setdiff() still returns every column if
# "diabetes" were ever missing, whereas -which(...) would select none.
predictor_cols <- setdiff(names(train_data), "diabetes")
smote_output1 <- SMOTE(
  X = train_data[, predictor_cols, drop = FALSE],
  target = train_data$diabetes,
  K = 5,       # neighbours used when interpolating synthetic points
  dup_size = 1 # 1 synthetic sample per minority sample (doubles the minority)
)

# Convert to a data frame; the labels come back in a column named `class`
balanced_train <- data.frame(smote_output1$data)
table(balanced_train$class)
#> neg pos
#> 400 430

# Make the outcome a factor so it is modelled as a classification target
balanced_train$class <- factor(balanced_train$class)

set.seed(123) # Ensure reproducibility
# Resampling scheme shared by every model below.
# NOTE(review): the folds are drawn from the SMOTE-augmented data, so held-out
# folds contain synthetic points and the CV estimates may be optimistic; the
# test-set metrics are the ones to trust.
mycontrol <- trainControl(
  method = "cv",                    # Cross-validation
  number = 10,                      # 10-fold
  classProbs = TRUE,                # Enable probability estimation
  summaryFunction = twoClassSummary # Compute ROC, Sensitivity, Specificity
)

# Logistic regression on the balanced training data.
# metric is set explicitly: twoClassSummary does not report "Accuracy" (the
# default metric), so without it train() warns and falls back to ROC anyway.
logistic_model <- train(
  class ~ .,
  data = balanced_train,
  method = "glm",
  family = "binomial",
  trControl = mycontrol,
  metric = "ROC"
)
# Cross-validated ROC / sensitivity / specificity of the logistic model
print(logistic_model)
#> Generalized Linear Model
#> 830 samples
#> 8 predictor
#> 2 classes: 'neg', 'pos'
#> No pre-processing
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 747, 747, 747, 747, 747, 747, ...
#> Resampling results:
#> ROC Sens Spec
#> 0.8286628 0.745 0.7534884
# Predict class labels for the held-out test set (computed once)
predictions <- predict(logistic_model, newdata = test_data)

# Compare predictions with the true labels, treating "pos" as the event
confusionMatrix(predictions, test_data$diabetes, positive = "pos")
#> Confusion Matrix and Statistics
#> Reference
#> Prediction neg pos
#> neg 81 13
#> pos 19 40
#> Accuracy : 0.7908
#> 95% CI : (0.7178, 0.8523)
#> No Information Rate : 0.6536
#> P-Value [Acc > NIR] : 0.0001499
#> Kappa : 0.5501
#> Mcnemar's Test P-Value : 0.3767591
#> Sensitivity : 0.7547
#> Specificity : 0.8100
#> Pos Pred Value : 0.6780
#> Neg Pred Value : 0.8617
#> Prevalence : 0.3464
#> Detection Rate : 0.2614
#> Detection Prevalence : 0.3856
#> Balanced Accuracy : 0.7824
#> 'Positive' Class : pos
# Class probabilities for the test set (one column per class: neg, pos)
pred_prob <- predict(logistic_model, newdata = test_data, type = "prob")

library(pROC)
#> Type 'citation("pROC")' for a citation.
#> Attaching package: 'pROC'
#> The following objects are masked from 'package:stats':
#> cov, smooth, var

# ROC on the test set, scored with the probability of the "pos" class
roc_curve <- roc(test_data$diabetes, pred_prob[, "pos"])
#> Setting levels: control = neg, case = pos
#> Setting direction: controls < cases

auc(roc_curve) # Get AUC value
#> Area under the curve: 0.8949
# Draw the logistic-regression ROC curve and annotate it with its AUC
plot(
  roc_curve,
  col = "blue",
  main = "ROC Curve for Logistic Regression",
  legacy.axes = TRUE
)
auc_value <- auc(roc_curve)
legend(
  "bottomright",
  legend = paste("LR: AUC =", round(auc_value, 3)),
  col = "blue",
  lwd = 3
)

# k-nearest neighbours, trying 10 candidate values of k.
# metric is set explicitly: twoClassSummary does not report "Accuracy" (the
# default metric), so without it train() warns and falls back to ROC anyway.
# NOTE(review): predictors are used unscaled although KNN is distance-based;
# consider preProcess = c("center", "scale") - this would change the results.
knn_model <- train(
  class ~ .,
  data = balanced_train,
  method = "knn",
  trControl = mycontrol,
  tuneLength = 10,
  metric = "ROC"
)
# Cross-validated performance for every candidate k (printed once; the
# summary was previously printed twice with identical output)
print(knn_model)
#> k-Nearest Neighbors
#> 830 samples
#> 8 predictor
#> 2 classes: 'neg', 'pos'
#> No pre-processing
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 747, 747, 747, 747, 747, 747, ...
#> Resampling results across tuning parameters:
#> k ROC Sens Spec
#> 5 0.8124128 0.6450 0.8279070
#> 7 0.8066570 0.6250 0.8116279
#> 9 0.8093314 0.6300 0.8000000
#> 11 0.8039244 0.6300 0.8023256
#> 13 0.8037791 0.6250 0.7976744
#> 15 0.8027907 0.6350 0.8046512
#> 17 0.8016860 0.6150 0.7976744
#> 19 0.7999128 0.6225 0.7953488
#> 21 0.7949128 0.6200 0.8000000
#> 23 0.7947674 0.6050 0.8162791
#> ROC was used to select the optimal model using the largest value.
#> The final value used for the model was k = 5.

# The selected number of neighbours
knn_model$bestTune
#>   k
#> 1 5
# Class labels predicted by KNN for the held-out test set
knn_predictions <- predict(knn_model, newdata = test_data)

# Test-set confusion matrix with "pos" as the event of interest
confusionMatrix(
  data = knn_predictions,
  reference = test_data$diabetes,
  positive = "pos"
)
#> Confusion Matrix and Statistics
#> Reference
#> Prediction neg pos
#> neg 68 11
#> pos 32 42
#> Accuracy : 0.719
#> 95% CI : (0.6407, 0.7886)
#> No Information Rate : 0.6536
#> P-Value [Acc > NIR] : 0.051516
#> Kappa : 0.4322
#> Mcnemar's Test P-Value : 0.002289
#> Sensitivity : 0.7925
#> Specificity : 0.6800
#> Pos Pred Value : 0.5676
#> Neg Pred Value : 0.8608
#> Prevalence : 0.3464
#> Detection Rate : 0.2745
#> Detection Prevalence : 0.4837
#> Balanced Accuracy : 0.7362
#> 'Positive' Class : pos
# KNN class probabilities for the test set (columns: neg, pos)
pred_prob_knn <- predict(knn_model, newdata = test_data, type = "prob")

# Test-set ROC for KNN, scored with the probability of the "pos" class.
# This reuses (overwrites) roc_curve and auc_value from the logistic model.
roc_curve <- roc(test_data$diabetes, pred_prob_knn[, "pos"])
#> Setting levels: control = neg, case = pos
#> Setting direction: controls < cases

auc(roc_curve) # Get AUC value
#> Area under the curve: 0.794

plot(
  roc_curve,
  col = "blue",
  main = "ROC Curve for KNN",
  legacy.axes = TRUE
)
# Add AUC to the plot
auc_value <- auc(roc_curve)
legend(
  "bottomright",
  legend = paste("KNN: AUC =", round(auc_value, 3)),
  col = "blue",
  lwd = 3
)

library(ranger)

set.seed(123) # For reproducibility
# Hyperparameter grid for ranger: 4 x 2 x 3 = 24 combinations
tune_grid <- expand.grid(
  mtry = c(2, 4, 6, 8),                # Number of features per split
  splitrule = c("gini", "extratrees"), # Splitting criterion
  min.node.size = c(1, 3, 5)           # Minimum samples per terminal node
)

# Random forest tuned over tune_grid with the shared 10-fold CV.
# metric is set explicitly: twoClassSummary does not report "Accuracy" (the
# default metric), so without it train() warns and falls back to ROC anyway.
rf_model <- train(
  class ~ .,
  data = balanced_train,
  method = "ranger",       # Fast Random Forest
  trControl = mycontrol,
  tuneGrid = tune_grid,    # Use custom hyperparameter tuning grid
  metric = "ROC",
  num.trees = 500,         # Number of trees
  importance = "impurity"  # Compute variable importance
)
# Winning hyperparameter combination (row 4 of the tuning grid)
print(rf_model$bestTune)
#>   mtry splitrule min.node.size
#> 4 2 extratrees 1

# Cross-validated performance for all 24 grid combinations
print(rf_model)
#> Random Forest
#> 830 samples
#> 8 predictor
#> 2 classes: 'neg', 'pos'
#> No pre-processing
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 747, 747, 747, 747, 747, 747, ...
#> Resampling results across tuning parameters:
#> mtry splitrule min.node.size ROC Sens Spec
#> 2 gini 1 0.9042151 0.7625 0.8790698
#> 2 gini 3 0.9033140 0.7525 0.8790698
#> 2 gini 5 0.8991279 0.7500 0.8604651
#> 2 extratrees 1 0.9177907 0.7675 0.8930233
#> 2 extratrees 3 0.9143605 0.7575 0.8930233
#> 2 extratrees 5 0.9053488 0.7650 0.8790698
#> 4 gini 1 0.8993023 0.7575 0.8674419
#> 4 gini 3 0.8949419 0.7500 0.8697674
#> 4 gini 5 0.8947674 0.7575 0.8604651
#> 4 extratrees 1 0.9145930 0.7675 0.8813953
#> 4 extratrees 3 0.9114244 0.7550 0.8883721
#> 4 extratrees 5 0.9076163 0.7725 0.8767442
#> 6 gini 1 0.8975872 0.7525 0.8534884
#> 6 gini 3 0.8941279 0.7550 0.8581395
#> 6 gini 5 0.8920058 0.7475 0.8558140
#> 6 extratrees 1 0.9136628 0.7775 0.8813953
#> 6 extratrees 3 0.9110465 0.7625 0.8790698
#> 6 extratrees 5 0.9070930 0.7725 0.8697674
#> 8 gini 1 0.8938372 0.7575 0.8534884
#> 8 gini 3 0.8919186 0.7475 0.8488372
#> 8 gini 5 0.8916860 0.7525 0.8488372
#> 8 extratrees 1 0.9099709 0.7625 0.8790698
#> 8 extratrees 3 0.9098547 0.7600 0.8767442
#> 8 extratrees 5 0.9066860 0.7825 0.8744186
#> ROC was used to select the optimal model using the largest value.
#> The final values used for the model were mtry = 2, splitrule = extratrees
#> and min.node.size = 1.
# Tuning profile: resampled performance across the hyperparameter grid
plot(rf_model)

# Variable importance, rescaled so the top predictor scores 100
varImp(rf_model)
#> ranger variable importance
#> Overall
#> glucose 100.000
#> age 50.334
#> mass 38.876
#> pedigree 26.692
#> pregnant 24.997
#> pressure 15.350
#> triceps 6.924
#> insulin 0.000
# Random-forest class labels and class probabilities for the test set
rf_predictions <- predict(rf_model, newdata = test_data)
rf_pred_prob <- predict(rf_model, newdata = test_data, type = "prob")

# Test-set confusion matrix with "pos" as the event of interest
confusionMatrix(
  data = rf_predictions,
  reference = test_data$diabetes,
  positive = "pos"
)
#> Confusion Matrix and Statistics
#> Reference
#> Prediction neg pos
#> neg 80 11
#> pos 20 42
#> Accuracy : 0.7974
#> 95% CI : (0.7249, 0.858)
#> No Information Rate : 0.6536
#> P-Value [Acc > NIR] : 7.169e-05
#> Kappa : 0.5697
#> Mcnemar's Test P-Value : 0.1508
#> Sensitivity : 0.7925
#> Specificity : 0.8000
#> Pos Pred Value : 0.6774
#> Neg Pred Value : 0.8791
#> Prevalence : 0.3464
#> Detection Rate : 0.2745
#> Detection Prevalence : 0.4052
#> Balanced Accuracy : 0.7962
#> 'Positive' Class : pos
library(pROC)

# Test-set ROC for the random forest, scored with the "pos" probability
rf_roc <- roc(test_data$diabetes, rf_pred_prob[, "pos"])
#> Setting levels: control = neg, case = pos
#> Setting direction: controls < cases

plot(
  rf_roc,
  legacy.axes = TRUE,
  col = "green",
  lwd = 2,
  main = "ROC Curve - Random Forest"
)
auc_value <- auc(rf_roc)
legend(
  "bottomright",
  legend = paste("AUC =", round(auc_value, 3)),
  col = "green",
  lwd = 2
)

library(caret)
library(xgboost)
library(pROC) # For ROC curve analysis

# Seed the resampling for the XGBoost fit. This call used to sit on the same
# line as the comment above and was therefore never executed.
set.seed(123)

# One fixed hyperparameter combination (a single-row grid), so train() fits
# exactly this configuration and performs no search.
grid_default <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

xgb_model <- train(
  class ~ .,
  data = balanced_train,
  method = "xgbTree",      # XGBoost classifier
  trControl = mycontrol,   # Use the same train control
  tuneGrid = grid_default, # Single fixed configuration (no tuning)
  metric = "ROC"           # Evaluate by area under the ROC curve
)
# Cross-validated performance of the single XGBoost configuration
# (nothing is tuned here, so there are no "best" parameters to report).
# The "held constant" lines were truncated in the captured output; the
# middle ones are restored from the values in grid_default.
print(xgb_model)
#> eXtreme Gradient Boosting
#> 830 samples
#> 8 predictor
#> 2 classes: 'neg', 'pos'
#> No pre-processing
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 747, 747, 747, 747, 747, 747, ...
#> Resampling results:
#> ROC Sens Spec
#> 0.8878488 0.7575 0.8395349
#> Tuning parameter 'nrounds' was held constant at a value of 100
#> Tuning parameter 'max_depth' was held constant at a value of 6
#> Tuning parameter 'eta' was held constant at a value of 0.3
#> Tuning parameter 'gamma' was held constant at a value of 0
#> Tuning parameter 'colsample_bytree' was held constant at a value of 1
#> Tuning parameter 'min_child_weight' was held constant at a value of 1
#> Tuning parameter 'subsample' was held constant at a value of 1

# Variable importance, rescaled so the top predictor scores 100
varImp(xgb_model)
#> xgbTree variable importance
#> Overall
#> glucose 100.00
#> mass 49.28
#> age 38.15
#> pedigree 27.33
#> pressure 14.75
#> pregnant 12.57
#> insulin 0.95
#> triceps 0.00
# XGBoost class labels and class probabilities for the test set
xgb_predictions <- predict(xgb_model, newdata = test_data)
xgb_pred_prob <- predict(xgb_model, newdata = test_data, type = "prob")

# mode = "everything" adds Precision, Recall and F1 to the usual statistics
confusionMatrix(
  data = xgb_predictions,
  reference = test_data$diabetes,
  mode = "everything",
  positive = "pos"
)
#> Confusion Matrix and Statistics
#> Reference
#> Prediction neg pos
#> neg 82 16
#> pos 18 37
#> Accuracy : 0.7778
#> 95% CI : (0.7036, 0.8409)
#> No Information Rate : 0.6536
#> P-Value [Acc > NIR] : 0.000586
#> Kappa : 0.5136
#> Mcnemar's Test P-Value : 0.863832
#> Sensitivity : 0.6981
#> Specificity : 0.8200
#> Pos Pred Value : 0.6727
#> Neg Pred Value : 0.8367
#> Precision : 0.6727
#> Recall : 0.6981
#> F1 : 0.6852
#> Prevalence : 0.3464
#> Detection Rate : 0.2418
#> Detection Prevalence : 0.3595
#> Balanced Accuracy : 0.7591
#> 'Positive' Class : pos
# Test-set ROC for XGBoost, scored with the XGBoost probability of "pos".
# The random-forest probabilities (rf_pred_prob) were passed here before,
# which made this plot a copy of the random-forest ROC curve.
xgb_roc <- roc(test_data$diabetes, xgb_pred_prob[, "pos"])
#> Setting levels: control = neg, case = pos
#> Setting direction: controls < cases

plot(
  xgb_roc,
  legacy.axes = TRUE,
  col = "green",
  lwd = 2,
  main = "ROC Curve - XGBoost"
)
auc_value <- auc(xgb_roc)
legend(
  "bottomright",
  legend = paste("AUC =", round(auc_value, 3)),
  col = "green",
  lwd = 2
)

# https://www.kaggle.com/code/pelkoja/visual-xgboost-tuning-with-caret

# Attach PRROC on its own line; it used to trail the URL comment above and
# was therefore never executed.
library(PRROC)

# Compute PR Curve using PRROC package
# scores.class0 holds the predicted probability of "pos" for every test row
# and weights.class0 the matching 0/1 labels (1 = "pos").
# The labels are derived from the level name rather than the factor's integer
# codes, and pr.curve() is namespace-qualified so the call does not depend on
# PRROC having been attached earlier.
pr_curve <- PRROC::pr.curve(
  scores.class0 = xgb_pred_prob[, 2],
  weights.class0 = as.numeric(test_data$diabetes == "pos"),
  curve = TRUE # keep the curve points so it can be plotted
)

# Plot Precision-Recall Curve
plot(pr_curve, col = "red", main = "Precision-Recall Curve - XGB")