#  Load libraries

library(readxl)
library(dplyr)
library(randomForest)
library(caret)
library(pROC)
library(ggplot2)

# Read the database
IMMAm <- read_excel("C:/Users/Samid Limon/Desktop/Version Final/IMMAm.xlsx")
IMMAh <- read_excel("C:/Users/Samid Limon/Desktop/Version Final/IMMAh.xlsx")

# Encode the dependent variable
IMMAm$Clas_ALMI <- ifelse(IMMAm$Clas_ALMI == "NORMAL", 1, ifelse(IMMAm$Clas_ALMI == "BAJO", 2, NA))
IMMAm$Clas_ALMI <- as.factor(IMMAm$Clas_ALMI)
IMMAh$Clas_IMMA <- ifelse(IMMAh$Clas_IMMA == "NORMAL", 1, ifelse(IMMAh$Clas_IMMA == "BAJO", 2, NA))
IMMAh$Clas_IMMA <- as.factor(IMMAh$Clas_IMMA)

IMMAm <- IMMAm[!is.na(IMMAm$Clas_ALMI), ]
IMMAh <- IMMAh[!is.na(IMMAh$Clas_IMMA), ]

# Partition identical (females)
set.seed(123)

trainsetALMIm <- IMMAm %>% group_by(Clas_ALMI) %>% sample_frac(0.7)
testsetALMIm  <- IMMAm %>% anti_join(trainsetALMIm, by = colnames(IMMAm))

# Partition identical  (males)
trainsetALMIh <- IMMAh %>% group_by(Clas_IMMA) %>% sample_frac(0.7)
testsetALMIh  <- IMMAh %>% anti_join(trainsetALMIh, by = colnames(IMMAh))

Random Forest

# Random Forest model
# Females
rf_script_m <- randomForest(Clas_ALMI ~ ., data = trainsetALMIm, ntree = 500)
pred_script_m <- predict(rf_script_m, newdata = testsetALMIm)
conf_mtx_script_m <- confusionMatrix(pred_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_mtx_script_m)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 27  3
##          2  2  4
##                                          
##                Accuracy : 0.8611         
##                  95% CI : (0.705, 0.9533)
##     No Information Rate : 0.8056         
##     P-Value [Acc > NIR] : 0.273          
##                                          
##                   Kappa : 0.5312         
##                                          
##  Mcnemar's Test P-Value : 1.000          
##                                          
##             Sensitivity : 0.5714         
##             Specificity : 0.9310         
##          Pos Pred Value : 0.6667         
##          Neg Pred Value : 0.9000         
##              Prevalence : 0.1944         
##          Detection Rate : 0.1111         
##    Detection Prevalence : 0.1667         
##       Balanced Accuracy : 0.7512         
##                                          
##        'Positive' Class : 2              
##

prob_script_m <- predict(rf_script_m, newdata = testsetALMIm, type = "prob")[, "2"]
roc_script_m <- roc(testsetALMIm$Clas_ALMI, prob_script_m, levels = c("1", "2"))
auc_script_m <- auc(roc_script_m)
plot(roc_script_m, main = "ROC Curve - RF Females")

# Males
rf_script_h <- randomForest(Clas_IMMA ~ ., data = trainsetALMIh, ntree = 500)
pred_script_h <- predict(rf_script_h, newdata = testsetALMIh)
conf_mtx_script_h <- confusionMatrix(pred_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_mtx_script_h)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 12  1
##          2  1  3
##                                           
##                Accuracy : 0.8824          
##                  95% CI : (0.6356, 0.9854)
##     No Information Rate : 0.7647          
##     P-Value [Acc > NIR] : 0.1998          
##                                           
##                   Kappa : 0.6731          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.7500          
##             Specificity : 0.9231          
##          Pos Pred Value : 0.7500          
##          Neg Pred Value : 0.9231          
##              Prevalence : 0.2353          
##          Detection Rate : 0.1765          
##    Detection Prevalence : 0.2353          
##       Balanced Accuracy : 0.8365          
##                                           
##        'Positive' Class : 2               
##

prob_script_h <- predict(rf_script_h, newdata = testsetALMIh, type = "prob")[, "2"]
roc_script_h <- roc(testsetALMIh$Clas_IMMA, prob_script_h, levels = c("1", "2"))
auc_script_h <- auc(roc_script_h)
plot(roc_script_h, main = "ROC Curve- RF Males")

Logistic Regression

# --- Females ---
glm_script_m <- glm(Clas_ALMI ~ ., data = trainsetALMIm, family = "binomial")
prob_glm_script_m <- predict(glm_script_m, newdata = testsetALMIm, type = "response")
pred_glm_script_m <- ifelse(prob_glm_script_m >= 0.5, 2, 1)
pred_glm_script_m <- as.factor(pred_glm_script_m)
conf_glm_script_m <- confusionMatrix(pred_glm_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_glm_script_m)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 27  3
##          2  2  4
##                                          
##                Accuracy : 0.8611         
##                  95% CI : (0.705, 0.9533)
##     No Information Rate : 0.8056         
##     P-Value [Acc > NIR] : 0.273          
##                                          
##                   Kappa : 0.5312         
##                                          
##  Mcnemar's Test P-Value : 1.000          
##                                          
##             Sensitivity : 0.5714         
##             Specificity : 0.9310         
##          Pos Pred Value : 0.6667         
##          Neg Pred Value : 0.9000         
##              Prevalence : 0.1944         
##          Detection Rate : 0.1111         
##    Detection Prevalence : 0.1667         
##       Balanced Accuracy : 0.7512         
##                                          
##        'Positive' Class : 2              
##

roc_glm_script_m <- roc(testsetALMIm$Clas_ALMI, prob_glm_script_m, levels = c("1", "2"))
auc_glm_script_m <- auc(roc_glm_script_m)
plot(roc_glm_script_m, main = "ROC Curve - GLM Females")

# --- Males ---
glm_script_h <- glm(Clas_IMMA ~ ., data = trainsetALMIh, family = "binomial")
prob_glm_script_h <- predict(glm_script_h, newdata = testsetALMIh, type = "response")
pred_glm_script_h <- ifelse(prob_glm_script_h >= 0.5, 2, 1)
pred_glm_script_h <- as.factor(pred_glm_script_h)
conf_glm_script_h <- confusionMatrix(pred_glm_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_glm_script_h)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 10  1
##          2  3  3
##                                          
##                Accuracy : 0.7647         
##                  95% CI : (0.501, 0.9319)
##     No Information Rate : 0.7647         
##     P-Value [Acc > NIR] : 0.6300         
##                                          
##                   Kappa : 0.4426         
##                                          
##  Mcnemar's Test P-Value : 0.6171         
##                                          
##             Sensitivity : 0.7500         
##             Specificity : 0.7692         
##          Pos Pred Value : 0.5000         
##          Neg Pred Value : 0.9091         
##              Prevalence : 0.2353         
##          Detection Rate : 0.1765         
##    Detection Prevalence : 0.3529         
##       Balanced Accuracy : 0.7596         
##                                          
##        'Positive' Class : 2              
##

roc_glm_script_h <- roc(testsetALMIh$Clas_IMMA, prob_glm_script_h, levels = c("1", "2"))
auc_glm_script_h <- auc(roc_glm_script_h)
plot(roc_glm_script_h, main = "ROC Curve - GLM Males")

Decision Tree

library(rpart)
library(rpart.plot)

# --- Females ---
dt_script_m <- rpart(Clas_ALMI ~ ., data = trainsetALMIm, method = "class")
rpart.plot(dt_script_m, main = "Decision Tree Females")

pred_dt_script_m <- predict(dt_script_m, newdata = testsetALMIm, type = "class")
conf_dt_script_m <- confusionMatrix(pred_dt_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_dt_script_m)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 29  3
##          2  0  4
##                                           
##                Accuracy : 0.9167          
##                  95% CI : (0.7753, 0.9825)
##     No Information Rate : 0.8056          
##     P-Value [Acc > NIR] : 0.06112         
##                                           
##                   Kappa : 0.6824          
##                                           
##  Mcnemar's Test P-Value : 0.24821         
##                                           
##             Sensitivity : 0.5714          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9062          
##              Prevalence : 0.1944          
##          Detection Rate : 0.1111          
##    Detection Prevalence : 0.1111          
##       Balanced Accuracy : 0.7857          
##                                           
##        'Positive' Class : 2               
##

prob_dt_script_m <- predict(dt_script_m, newdata = testsetALMIm)[, "2"]
roc_dt_script_m <- roc(testsetALMIm$Clas_ALMI, prob_dt_script_m, levels = c("1", "2"))
auc_dt_script_m <- auc(roc_dt_script_m)
plot(roc_dt_script_m, main = "ROC Curve - DT Females")

# --- Males ---
dt_script_h <- rpart(Clas_IMMA ~ ., data = trainsetALMIh, method = "class")
rpart.plot(dt_script_h, main = "Decision Tree (Males)")

pred_dt_script_h <- predict(dt_script_h, newdata = testsetALMIh, type = "class")
conf_dt_script_h <- confusionMatrix(pred_dt_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_dt_script_h)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 11  1
##          2  2  3
##                                          
##                Accuracy : 0.8235         
##                  95% CI : (0.5657, 0.962)
##     No Information Rate : 0.7647         
##     P-Value [Acc > NIR] : 0.4069         
##                                          
##                   Kappa : 0.5487         
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.7500         
##             Specificity : 0.8462         
##          Pos Pred Value : 0.6000         
##          Neg Pred Value : 0.9167         
##              Prevalence : 0.2353         
##          Detection Rate : 0.1765         
##    Detection Prevalence : 0.2941         
##       Balanced Accuracy : 0.7981         
##                                          
##        'Positive' Class : 2              
##

prob_dt_script_h <- predict(dt_script_h, newdata = testsetALMIh)[, "2"]
roc_dt_script_h <- roc(testsetALMIh$Clas_IMMA, prob_dt_script_h, levels = c("1", "2"))
auc_dt_script_h <- auc(roc_dt_script_h)
plot(roc_dt_script_h, main = "ROC Curve - DT Males")

Artificial Neural Network (ANN)

library(nnet)
# --- Females ---
ann_script_m <- nnet(Clas_ALMI ~ ., data = trainsetALMIm, size = 5, maxit = 500, decay = 5e-4, trace = FALSE)
prob_ann_script_m <- predict(ann_script_m, newdata = testsetALMIm, type = "raw")
pred_ann_script_m <- ifelse(prob_ann_script_m > 0.5, 2, 1)
pred_ann_script_m <- as.factor(pred_ann_script_m)
conf_ann_script_m <- confusionMatrix(pred_ann_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_ann_script_m)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 25  5
##          2  4  2
##                                          
##                Accuracy : 0.75           
##                  95% CI : (0.578, 0.8788)
##     No Information Rate : 0.8056         
##     P-Value [Acc > NIR] : 0.8535         
##                                          
##                   Kappa : 0.1562         
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.28571        
##             Specificity : 0.86207        
##          Pos Pred Value : 0.33333        
##          Neg Pred Value : 0.83333        
##              Prevalence : 0.19444        
##          Detection Rate : 0.05556        
##    Detection Prevalence : 0.16667        
##       Balanced Accuracy : 0.57389        
##                                          
##        'Positive' Class : 2              
##

roc_ann_script_m <- roc(testsetALMIm$Clas_ALMI, as.vector(prob_ann_script_m), levels = c("1", "2"))
auc_ann_script_m <- auc(roc_ann_script_m)
plot(roc_ann_script_m, main = "ROC Curve - ANN Females")

# --- Males ---
ann_script_h <- nnet(Clas_IMMA ~ ., data = trainsetALMIh, size = 5, maxit = 500, decay = 5e-4, trace = FALSE)
prob_ann_script_h <- predict(ann_script_h, newdata = testsetALMIh, type = "raw")
pred_ann_script_h <- ifelse(prob_ann_script_h > 0.5, 2, 1)
pred_ann_script_h <- as.factor(pred_ann_script_h)
conf_ann_script_h <- confusionMatrix(pred_ann_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_ann_script_h)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 11  1
##          2  2  3
##                                          
##                Accuracy : 0.8235         
##                  95% CI : (0.5657, 0.962)
##     No Information Rate : 0.7647         
##     P-Value [Acc > NIR] : 0.4069         
##                                          
##                   Kappa : 0.5487         
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.7500         
##             Specificity : 0.8462         
##          Pos Pred Value : 0.6000         
##          Neg Pred Value : 0.9167         
##              Prevalence : 0.2353         
##          Detection Rate : 0.1765         
##    Detection Prevalence : 0.2941         
##       Balanced Accuracy : 0.7981         
##                                          
##        'Positive' Class : 2              
##

roc_ann_script_h <- roc(testsetALMIh$Clas_IMMA, as.vector(prob_ann_script_h), levels = c("1", "2"))
auc_ann_script_h <- auc(roc_ann_script_h)
plot(roc_ann_script_h, main = "ROC Curve - ANN Males")

library(nnet)
library(NeuralNetTools)

# Females
plotnet(ann_script_m, alpha = 0.7, circle_col = "#2F4731")
title(main = "Artificial Neural Network Diagram – Females")

# Males
plotnet(ann_script_h, alpha = 0.7, circle_col = "#3063a7")
title(main = "Artificial Neural Network Diagram – Males")

LASSO

library(glmnet)
# --- Females ---
x_script_m <- model.matrix(Clas_ALMI ~ ., data = trainsetALMIm)[,-1]
y_script_m <- as.numeric(as.character(trainsetALMIm$Clas_ALMI))
lasso_script_m <- cv.glmnet(x_script_m, y_script_m, alpha = 1, family = "binomial")
x_test_script_m <- model.matrix(Clas_ALMI ~ ., data = testsetALMIm)[,-1]
prob_lasso_script_m <- predict(lasso_script_m, s = "lambda.min", newx = x_test_script_m, type = "response")
pred_lasso_script_m <- ifelse(prob_lasso_script_m > 0.5, 2, 1)
pred_lasso_script_m <- as.factor(pred_lasso_script_m)
conf_lasso_script_m <- confusionMatrix(pred_lasso_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_lasso_script_m)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 28  3
##          2  1  4
##                                           
##                Accuracy : 0.8889          
##                  95% CI : (0.7394, 0.9689)
##     No Information Rate : 0.8056          
##     P-Value [Acc > NIR] : 0.1444          
##                                           
##                   Kappa : 0.6022          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.5714          
##             Specificity : 0.9655          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.9032          
##              Prevalence : 0.1944          
##          Detection Rate : 0.1111          
##    Detection Prevalence : 0.1389          
##       Balanced Accuracy : 0.7685          
##                                           
##        'Positive' Class : 2               
##

roc_lasso_script_m <- roc(testsetALMIm$Clas_ALMI, as.vector(prob_lasso_script_m), levels = c("1", "2"))
auc_lasso_script_m <- auc(roc_lasso_script_m)
plot(roc_lasso_script_m, main = "ROC Curve - LASSO Females")

# --- Males ---
x_script_h <- model.matrix(Clas_IMMA ~ ., data = trainsetALMIh)[,-1]
y_script_h <- as.numeric(as.character(trainsetALMIh$Clas_IMMA))
lasso_script_h <- cv.glmnet(x_script_h, y_script_h, alpha = 1, family = "binomial")
x_test_script_h <- model.matrix(Clas_IMMA ~ ., data = testsetALMIh)[,-1]
prob_lasso_script_h <- predict(lasso_script_h, s = "lambda.min", newx = x_test_script_h, type = "response")
pred_lasso_script_h <- ifelse(prob_lasso_script_h > 0.5, 2, 1)
pred_lasso_script_h <- as.factor(pred_lasso_script_h)
conf_lasso_script_h <- confusionMatrix(pred_lasso_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_lasso_script_h)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 11  1
##          2  2  3
##                                          
##                Accuracy : 0.8235         
##                  95% CI : (0.5657, 0.962)
##     No Information Rate : 0.7647         
##     P-Value [Acc > NIR] : 0.4069         
##                                          
##                   Kappa : 0.5487         
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.7500         
##             Specificity : 0.8462         
##          Pos Pred Value : 0.6000         
##          Neg Pred Value : 0.9167         
##              Prevalence : 0.2353         
##          Detection Rate : 0.1765         
##    Detection Prevalence : 0.2941         
##       Balanced Accuracy : 0.7981         
##                                          
##        'Positive' Class : 2              
##

roc_lasso_script_h <- roc(testsetALMIh$Clas_IMMA, as.vector(prob_lasso_script_h), levels = c("1", "2"))
auc_lasso_script_h <- auc(roc_lasso_script_h)
plot(roc_lasso_script_h, main = "ROC Curve - LASSO Males")

Graphical comparison of ROC curves by sex

# Females
plot(roc_glm_script_m, col="blue", lwd=2, main="ROC Curves – All models (Females)")
plot(roc_dt_script_m, col="red", add=TRUE, lwd=2)
plot(roc_script_m, col="forestgreen", add=TRUE, lwd=2) # Random Forest
plot(roc_ann_script_m, col="purple", add=TRUE, lwd=2)
plot(roc_lasso_script_m, col="orange", add=TRUE, lwd=2)
legend("bottomright",
       legend=c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
       col=c("blue", "red", "forestgreen", "purple", "orange"), lwd=2, cex=0.8)

# Males
plot(roc_glm_script_h, col="blue", lwd=2, main="ROC Curves – All models (Males)")
plot(roc_dt_script_h, col="red", add=TRUE, lwd=2)
plot(roc_script_h, col="forestgreen", add=TRUE, lwd=2) # Random Forest
plot(roc_ann_script_h, col="purple", add=TRUE, lwd=2)
plot(roc_lasso_script_h, col="orange", add=TRUE, lwd=2)
legend("bottomright",
       legend=c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
       col=c("blue", "red", "forestgreen", "purple", "orange"), lwd=2, cex=0.8)

Summary metrics table

# Females
tab_m <- data.frame(
  Modelo = c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
  Accuracy    = c(conf_glm_script_m$overall["Accuracy"], conf_dt_script_m$overall["Accuracy"], conf_mtx_script_m$overall["Accuracy"], conf_ann_script_m$overall["Accuracy"], conf_lasso_script_m$overall["Accuracy"]),
  Sensitivity = c(conf_glm_script_m$byClass["Sensitivity"], conf_dt_script_m$byClass["Sensitivity"], conf_mtx_script_m$byClass["Sensitivity"], conf_ann_script_m$byClass["Sensitivity"], conf_lasso_script_m$byClass["Sensitivity"]),
  Specificity = c(conf_glm_script_m$byClass["Specificity"], conf_dt_script_m$byClass["Specificity"], conf_mtx_script_m$byClass["Specificity"], conf_ann_script_m$byClass["Specificity"], conf_lasso_script_m$byClass["Specificity"]),
  Precision   = c(conf_glm_script_m$byClass["Precision"], conf_dt_script_m$byClass["Precision"], conf_mtx_script_m$byClass["Precision"], conf_ann_script_m$byClass["Precision"], conf_lasso_script_m$byClass["Precision"]),
  AUC         = c(auc_glm_script_m, auc_dt_script_m, auc_script_m, auc_ann_script_m, auc_lasso_script_m)
)
knitr::kable(tab_m, digits=3, caption="Table 1. Performance of ML models for the prediction of low ALMI in females")

Table 1. Performance of ML models for the prediction of low ALMI in females
Modelo	Accuracy	Sensitivity	Specificity	Precision	AUC
Logistic Regression	0.861	0.571	0.931	0.667	0.761
Decision Tree	0.917	0.571	1.000	1.000	0.835
Random Forest	0.861	0.571	0.931	0.667	0.815
ANN	0.750	0.286	0.862	0.333	0.847
LASSO	0.889	0.571	0.966	0.800	0.842

# Males
tab_h <- data.frame(
  Modelo = c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
  Accuracy    = c(conf_glm_script_h$overall["Accuracy"], conf_dt_script_h$overall["Accuracy"], conf_mtx_script_h$overall["Accuracy"], conf_ann_script_h$overall["Accuracy"], conf_lasso_script_h$overall["Accuracy"]),
  Sensitivity = c(conf_glm_script_h$byClass["Sensitivity"], conf_dt_script_h$byClass["Sensitivity"], conf_mtx_script_h$byClass["Sensitivity"], conf_ann_script_h$byClass["Sensitivity"], conf_lasso_script_h$byClass["Sensitivity"]),
  Specificity = c(conf_glm_script_h$byClass["Specificity"], conf_dt_script_h$byClass["Specificity"], conf_mtx_script_h$byClass["Specificity"], conf_ann_script_h$byClass["Specificity"], conf_lasso_script_h$byClass["Specificity"]),
  Precision   = c(conf_glm_script_h$byClass["Precision"], conf_dt_script_h$byClass["Precision"], conf_mtx_script_h$byClass["Precision"], conf_ann_script_h$byClass["Precision"], conf_lasso_script_h$byClass["Precision"]),
  AUC         = c(auc_glm_script_h, auc_dt_script_h, auc_script_h, auc_ann_script_h, auc_lasso_script_h)
)
knitr::kable(tab_h, digits=3, caption="Table 2. Performance of ML models for the prediction of low ALMI in males")

Table 2. Performance of ML models for the prediction of low ALMI in males
Modelo	Accuracy	Sensitivity	Specificity	Precision	AUC
Logistic Regression	0.765	0.75	0.769	0.50	0.769
Decision Tree	0.824	0.75	0.846	0.60	0.798
Random Forest	0.882	0.75	0.923	0.75	0.788
ANN	0.824	0.75	0.846	0.60	0.923
LASSO	0.824	0.75	0.846	0.60	0.865

Machine Learning Models for Prediction of Low ALMI

Ana González-Martín, Edgar Samid Limón-Villegas, Zyanya Reyes Castillo, Nicoletta Righini

2025-05-27

Random Forest

Logistic Regression

Decision Tree

Artificial Neural Network (ANN)

LASSO

Graphical comparison of ROC curves by sex

Summary metrics table