# Load libraries
library(readxl)
library(dplyr)
library(randomForest)
library(caret)
library(pROC)
library(ggplot2)
# Read the database
IMMAm <- read_excel("C:/Users/Samid Limon/Desktop/Version Final/IMMAm.xlsx")
IMMAh <- read_excel("C:/Users/Samid Limon/Desktop/Version Final/IMMAh.xlsx")
# Encode the dependent variable
IMMAm$Clas_ALMI <- ifelse(IMMAm$Clas_ALMI == "NORMAL", 1, ifelse(IMMAm$Clas_ALMI == "BAJO", 2, NA))
IMMAm$Clas_ALMI <- as.factor(IMMAm$Clas_ALMI)
IMMAh$Clas_IMMA <- ifelse(IMMAh$Clas_IMMA == "NORMAL", 1, ifelse(IMMAh$Clas_IMMA == "BAJO", 2, NA))
IMMAh$Clas_IMMA <- as.factor(IMMAh$Clas_IMMA)
IMMAm <- IMMAm[!is.na(IMMAm$Clas_ALMI), ]
IMMAh <- IMMAh[!is.na(IMMAh$Clas_IMMA), ]
# Partition identical (females)
set.seed(123)
trainsetALMIm <- IMMAm %>% group_by(Clas_ALMI) %>% sample_frac(0.7)
testsetALMIm <- IMMAm %>% anti_join(trainsetALMIm, by = colnames(IMMAm))
# Partition identical (males)
trainsetALMIh <- IMMAh %>% group_by(Clas_IMMA) %>% sample_frac(0.7)
testsetALMIh <- IMMAh %>% anti_join(trainsetALMIh, by = colnames(IMMAh))
Random Forest
# Random Forest model
# Females
rf_script_m <- randomForest(Clas_ALMI ~ ., data = trainsetALMIm, ntree = 500)
pred_script_m <- predict(rf_script_m, newdata = testsetALMIm)
conf_mtx_script_m <- confusionMatrix(pred_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_mtx_script_m)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 27 3
## 2 2 4
##
## Accuracy : 0.8611
## 95% CI : (0.705, 0.9533)
## No Information Rate : 0.8056
## P-Value [Acc > NIR] : 0.273
##
## Kappa : 0.5312
##
## Mcnemar's Test P-Value : 1.000
##
## Sensitivity : 0.5714
## Specificity : 0.9310
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.9000
## Prevalence : 0.1944
## Detection Rate : 0.1111
## Detection Prevalence : 0.1667
## Balanced Accuracy : 0.7512
##
## 'Positive' Class : 2
##
prob_script_m <- predict(rf_script_m, newdata = testsetALMIm, type = "prob")[, "2"]
roc_script_m <- roc(testsetALMIm$Clas_ALMI, prob_script_m, levels = c("1", "2"))
auc_script_m <- auc(roc_script_m)
plot(roc_script_m, main = "ROC Curve - RF Females")

# Males
rf_script_h <- randomForest(Clas_IMMA ~ ., data = trainsetALMIh, ntree = 500)
pred_script_h <- predict(rf_script_h, newdata = testsetALMIh)
conf_mtx_script_h <- confusionMatrix(pred_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_mtx_script_h)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 12 1
## 2 1 3
##
## Accuracy : 0.8824
## 95% CI : (0.6356, 0.9854)
## No Information Rate : 0.7647
## P-Value [Acc > NIR] : 0.1998
##
## Kappa : 0.6731
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7500
## Specificity : 0.9231
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.9231
## Prevalence : 0.2353
## Detection Rate : 0.1765
## Detection Prevalence : 0.2353
## Balanced Accuracy : 0.8365
##
## 'Positive' Class : 2
##
prob_script_h <- predict(rf_script_h, newdata = testsetALMIh, type = "prob")[, "2"]
roc_script_h <- roc(testsetALMIh$Clas_IMMA, prob_script_h, levels = c("1", "2"))
auc_script_h <- auc(roc_script_h)
plot(roc_script_h, main = "ROC Curve- RF Males")

Logistic Regression
# --- Females ---
glm_script_m <- glm(Clas_ALMI ~ ., data = trainsetALMIm, family = "binomial")
prob_glm_script_m <- predict(glm_script_m, newdata = testsetALMIm, type = "response")
pred_glm_script_m <- ifelse(prob_glm_script_m >= 0.5, 2, 1)
pred_glm_script_m <- as.factor(pred_glm_script_m)
conf_glm_script_m <- confusionMatrix(pred_glm_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_glm_script_m)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 27 3
## 2 2 4
##
## Accuracy : 0.8611
## 95% CI : (0.705, 0.9533)
## No Information Rate : 0.8056
## P-Value [Acc > NIR] : 0.273
##
## Kappa : 0.5312
##
## Mcnemar's Test P-Value : 1.000
##
## Sensitivity : 0.5714
## Specificity : 0.9310
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.9000
## Prevalence : 0.1944
## Detection Rate : 0.1111
## Detection Prevalence : 0.1667
## Balanced Accuracy : 0.7512
##
## 'Positive' Class : 2
##
roc_glm_script_m <- roc(testsetALMIm$Clas_ALMI, prob_glm_script_m, levels = c("1", "2"))
auc_glm_script_m <- auc(roc_glm_script_m)
plot(roc_glm_script_m, main = "ROC Curve - GLM Females")

# --- Males ---
glm_script_h <- glm(Clas_IMMA ~ ., data = trainsetALMIh, family = "binomial")
prob_glm_script_h <- predict(glm_script_h, newdata = testsetALMIh, type = "response")
pred_glm_script_h <- ifelse(prob_glm_script_h >= 0.5, 2, 1)
pred_glm_script_h <- as.factor(pred_glm_script_h)
conf_glm_script_h <- confusionMatrix(pred_glm_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_glm_script_h)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 10 1
## 2 3 3
##
## Accuracy : 0.7647
## 95% CI : (0.501, 0.9319)
## No Information Rate : 0.7647
## P-Value [Acc > NIR] : 0.6300
##
## Kappa : 0.4426
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.7500
## Specificity : 0.7692
## Pos Pred Value : 0.5000
## Neg Pred Value : 0.9091
## Prevalence : 0.2353
## Detection Rate : 0.1765
## Detection Prevalence : 0.3529
## Balanced Accuracy : 0.7596
##
## 'Positive' Class : 2
##
roc_glm_script_h <- roc(testsetALMIh$Clas_IMMA, prob_glm_script_h, levels = c("1", "2"))
auc_glm_script_h <- auc(roc_glm_script_h)
plot(roc_glm_script_h, main = "ROC Curve - GLM Males")

Decision Tree
library(rpart)
library(rpart.plot)
# --- Females ---
dt_script_m <- rpart(Clas_ALMI ~ ., data = trainsetALMIm, method = "class")
rpart.plot(dt_script_m, main = "Decision Tree Females")

pred_dt_script_m <- predict(dt_script_m, newdata = testsetALMIm, type = "class")
conf_dt_script_m <- confusionMatrix(pred_dt_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_dt_script_m)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 29 3
## 2 0 4
##
## Accuracy : 0.9167
## 95% CI : (0.7753, 0.9825)
## No Information Rate : 0.8056
## P-Value [Acc > NIR] : 0.06112
##
## Kappa : 0.6824
##
## Mcnemar's Test P-Value : 0.24821
##
## Sensitivity : 0.5714
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9062
## Prevalence : 0.1944
## Detection Rate : 0.1111
## Detection Prevalence : 0.1111
## Balanced Accuracy : 0.7857
##
## 'Positive' Class : 2
##
prob_dt_script_m <- predict(dt_script_m, newdata = testsetALMIm)[, "2"]
roc_dt_script_m <- roc(testsetALMIm$Clas_ALMI, prob_dt_script_m, levels = c("1", "2"))
auc_dt_script_m <- auc(roc_dt_script_m)
plot(roc_dt_script_m, main = "ROC Curve - DT Females")

# --- Males ---
dt_script_h <- rpart(Clas_IMMA ~ ., data = trainsetALMIh, method = "class")
rpart.plot(dt_script_h, main = "Decision Tree (Males)")

pred_dt_script_h <- predict(dt_script_h, newdata = testsetALMIh, type = "class")
conf_dt_script_h <- confusionMatrix(pred_dt_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_dt_script_h)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 11 1
## 2 2 3
##
## Accuracy : 0.8235
## 95% CI : (0.5657, 0.962)
## No Information Rate : 0.7647
## P-Value [Acc > NIR] : 0.4069
##
## Kappa : 0.5487
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7500
## Specificity : 0.8462
## Pos Pred Value : 0.6000
## Neg Pred Value : 0.9167
## Prevalence : 0.2353
## Detection Rate : 0.1765
## Detection Prevalence : 0.2941
## Balanced Accuracy : 0.7981
##
## 'Positive' Class : 2
##
prob_dt_script_h <- predict(dt_script_h, newdata = testsetALMIh)[, "2"]
roc_dt_script_h <- roc(testsetALMIh$Clas_IMMA, prob_dt_script_h, levels = c("1", "2"))
auc_dt_script_h <- auc(roc_dt_script_h)
plot(roc_dt_script_h, main = "ROC Curve - DT Males")

Artificial Neural Network (ANN)
library(nnet)
# --- Females ---
ann_script_m <- nnet(Clas_ALMI ~ ., data = trainsetALMIm, size = 5, maxit = 500, decay = 5e-4, trace = FALSE)
prob_ann_script_m <- predict(ann_script_m, newdata = testsetALMIm, type = "raw")
pred_ann_script_m <- ifelse(prob_ann_script_m > 0.5, 2, 1)
pred_ann_script_m <- as.factor(pred_ann_script_m)
conf_ann_script_m <- confusionMatrix(pred_ann_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_ann_script_m)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 25 5
## 2 4 2
##
## Accuracy : 0.75
## 95% CI : (0.578, 0.8788)
## No Information Rate : 0.8056
## P-Value [Acc > NIR] : 0.8535
##
## Kappa : 0.1562
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.28571
## Specificity : 0.86207
## Pos Pred Value : 0.33333
## Neg Pred Value : 0.83333
## Prevalence : 0.19444
## Detection Rate : 0.05556
## Detection Prevalence : 0.16667
## Balanced Accuracy : 0.57389
##
## 'Positive' Class : 2
##
roc_ann_script_m <- roc(testsetALMIm$Clas_ALMI, as.vector(prob_ann_script_m), levels = c("1", "2"))
auc_ann_script_m <- auc(roc_ann_script_m)
plot(roc_ann_script_m, main = "ROC Curve - ANN Females")

# --- Males ---
ann_script_h <- nnet(Clas_IMMA ~ ., data = trainsetALMIh, size = 5, maxit = 500, decay = 5e-4, trace = FALSE)
prob_ann_script_h <- predict(ann_script_h, newdata = testsetALMIh, type = "raw")
pred_ann_script_h <- ifelse(prob_ann_script_h > 0.5, 2, 1)
pred_ann_script_h <- as.factor(pred_ann_script_h)
conf_ann_script_h <- confusionMatrix(pred_ann_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_ann_script_h)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 11 1
## 2 2 3
##
## Accuracy : 0.8235
## 95% CI : (0.5657, 0.962)
## No Information Rate : 0.7647
## P-Value [Acc > NIR] : 0.4069
##
## Kappa : 0.5487
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7500
## Specificity : 0.8462
## Pos Pred Value : 0.6000
## Neg Pred Value : 0.9167
## Prevalence : 0.2353
## Detection Rate : 0.1765
## Detection Prevalence : 0.2941
## Balanced Accuracy : 0.7981
##
## 'Positive' Class : 2
##
roc_ann_script_h <- roc(testsetALMIh$Clas_IMMA, as.vector(prob_ann_script_h), levels = c("1", "2"))
auc_ann_script_h <- auc(roc_ann_script_h)
plot(roc_ann_script_h, main = "ROC Curve - ANN Males")

library(nnet)
library(NeuralNetTools)
# Females
plotnet(ann_script_m, alpha = 0.7, circle_col = "#2F4731")
title(main = "Artificial Neural Network Diagram – Females")

# Males
plotnet(ann_script_h, alpha = 0.7, circle_col = "#3063a7")
title(main = "Artificial Neural Network Diagram – Males")

LASSO
library(glmnet)
# --- Females ---
x_script_m <- model.matrix(Clas_ALMI ~ ., data = trainsetALMIm)[,-1]
y_script_m <- as.numeric(as.character(trainsetALMIm$Clas_ALMI))
lasso_script_m <- cv.glmnet(x_script_m, y_script_m, alpha = 1, family = "binomial")
x_test_script_m <- model.matrix(Clas_ALMI ~ ., data = testsetALMIm)[,-1]
prob_lasso_script_m <- predict(lasso_script_m, s = "lambda.min", newx = x_test_script_m, type = "response")
pred_lasso_script_m <- ifelse(prob_lasso_script_m > 0.5, 2, 1)
pred_lasso_script_m <- as.factor(pred_lasso_script_m)
conf_lasso_script_m <- confusionMatrix(pred_lasso_script_m, testsetALMIm$Clas_ALMI, positive = "2")
print(conf_lasso_script_m)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 28 3
## 2 1 4
##
## Accuracy : 0.8889
## 95% CI : (0.7394, 0.9689)
## No Information Rate : 0.8056
## P-Value [Acc > NIR] : 0.1444
##
## Kappa : 0.6022
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.5714
## Specificity : 0.9655
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.9032
## Prevalence : 0.1944
## Detection Rate : 0.1111
## Detection Prevalence : 0.1389
## Balanced Accuracy : 0.7685
##
## 'Positive' Class : 2
##
roc_lasso_script_m <- roc(testsetALMIm$Clas_ALMI, as.vector(prob_lasso_script_m), levels = c("1", "2"))
auc_lasso_script_m <- auc(roc_lasso_script_m)
plot(roc_lasso_script_m, main = "ROC Curve - LASSO Females")

# --- Males ---
x_script_h <- model.matrix(Clas_IMMA ~ ., data = trainsetALMIh)[,-1]
y_script_h <- as.numeric(as.character(trainsetALMIh$Clas_IMMA))
lasso_script_h <- cv.glmnet(x_script_h, y_script_h, alpha = 1, family = "binomial")
x_test_script_h <- model.matrix(Clas_IMMA ~ ., data = testsetALMIh)[,-1]
prob_lasso_script_h <- predict(lasso_script_h, s = "lambda.min", newx = x_test_script_h, type = "response")
pred_lasso_script_h <- ifelse(prob_lasso_script_h > 0.5, 2, 1)
pred_lasso_script_h <- as.factor(pred_lasso_script_h)
conf_lasso_script_h <- confusionMatrix(pred_lasso_script_h, testsetALMIh$Clas_IMMA, positive = "2")
print(conf_lasso_script_h)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 11 1
## 2 2 3
##
## Accuracy : 0.8235
## 95% CI : (0.5657, 0.962)
## No Information Rate : 0.7647
## P-Value [Acc > NIR] : 0.4069
##
## Kappa : 0.5487
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7500
## Specificity : 0.8462
## Pos Pred Value : 0.6000
## Neg Pred Value : 0.9167
## Prevalence : 0.2353
## Detection Rate : 0.1765
## Detection Prevalence : 0.2941
## Balanced Accuracy : 0.7981
##
## 'Positive' Class : 2
##
roc_lasso_script_h <- roc(testsetALMIh$Clas_IMMA, as.vector(prob_lasso_script_h), levels = c("1", "2"))
auc_lasso_script_h <- auc(roc_lasso_script_h)
plot(roc_lasso_script_h, main = "ROC Curve - LASSO Males")

Graphical comparison of ROC curves by sex
# Females
plot(roc_glm_script_m, col="blue", lwd=2, main="ROC Curves – All models (Females)")
plot(roc_dt_script_m, col="red", add=TRUE, lwd=2)
plot(roc_script_m, col="forestgreen", add=TRUE, lwd=2) # Random Forest
plot(roc_ann_script_m, col="purple", add=TRUE, lwd=2)
plot(roc_lasso_script_m, col="orange", add=TRUE, lwd=2)
legend("bottomright",
legend=c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
col=c("blue", "red", "forestgreen", "purple", "orange"), lwd=2, cex=0.8)

# Males
plot(roc_glm_script_h, col="blue", lwd=2, main="ROC Curves – All models (Males)")
plot(roc_dt_script_h, col="red", add=TRUE, lwd=2)
plot(roc_script_h, col="forestgreen", add=TRUE, lwd=2) # Random Forest
plot(roc_ann_script_h, col="purple", add=TRUE, lwd=2)
plot(roc_lasso_script_h, col="orange", add=TRUE, lwd=2)
legend("bottomright",
legend=c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
col=c("blue", "red", "forestgreen", "purple", "orange"), lwd=2, cex=0.8)

Summary metrics table
# Females
tab_m <- data.frame(
Modelo = c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
Accuracy = c(conf_glm_script_m$overall["Accuracy"], conf_dt_script_m$overall["Accuracy"], conf_mtx_script_m$overall["Accuracy"], conf_ann_script_m$overall["Accuracy"], conf_lasso_script_m$overall["Accuracy"]),
Sensitivity = c(conf_glm_script_m$byClass["Sensitivity"], conf_dt_script_m$byClass["Sensitivity"], conf_mtx_script_m$byClass["Sensitivity"], conf_ann_script_m$byClass["Sensitivity"], conf_lasso_script_m$byClass["Sensitivity"]),
Specificity = c(conf_glm_script_m$byClass["Specificity"], conf_dt_script_m$byClass["Specificity"], conf_mtx_script_m$byClass["Specificity"], conf_ann_script_m$byClass["Specificity"], conf_lasso_script_m$byClass["Specificity"]),
Precision = c(conf_glm_script_m$byClass["Precision"], conf_dt_script_m$byClass["Precision"], conf_mtx_script_m$byClass["Precision"], conf_ann_script_m$byClass["Precision"], conf_lasso_script_m$byClass["Precision"]),
AUC = c(auc_glm_script_m, auc_dt_script_m, auc_script_m, auc_ann_script_m, auc_lasso_script_m)
)
knitr::kable(tab_m, digits=3, caption="Table 1. Performance of ML models for the prediction of low ALMI in females")
Table 1. Performance of ML models for the prediction of low
ALMI in females
| Logistic Regression |
0.861 |
0.571 |
0.931 |
0.667 |
0.761 |
| Decision Tree |
0.917 |
0.571 |
1.000 |
1.000 |
0.835 |
| Random Forest |
0.861 |
0.571 |
0.931 |
0.667 |
0.815 |
| ANN |
0.750 |
0.286 |
0.862 |
0.333 |
0.847 |
| LASSO |
0.889 |
0.571 |
0.966 |
0.800 |
0.842 |
# Males
tab_h <- data.frame(
Modelo = c("Logistic Regression", "Decision Tree", "Random Forest", "ANN", "LASSO"),
Accuracy = c(conf_glm_script_h$overall["Accuracy"], conf_dt_script_h$overall["Accuracy"], conf_mtx_script_h$overall["Accuracy"], conf_ann_script_h$overall["Accuracy"], conf_lasso_script_h$overall["Accuracy"]),
Sensitivity = c(conf_glm_script_h$byClass["Sensitivity"], conf_dt_script_h$byClass["Sensitivity"], conf_mtx_script_h$byClass["Sensitivity"], conf_ann_script_h$byClass["Sensitivity"], conf_lasso_script_h$byClass["Sensitivity"]),
Specificity = c(conf_glm_script_h$byClass["Specificity"], conf_dt_script_h$byClass["Specificity"], conf_mtx_script_h$byClass["Specificity"], conf_ann_script_h$byClass["Specificity"], conf_lasso_script_h$byClass["Specificity"]),
Precision = c(conf_glm_script_h$byClass["Precision"], conf_dt_script_h$byClass["Precision"], conf_mtx_script_h$byClass["Precision"], conf_ann_script_h$byClass["Precision"], conf_lasso_script_h$byClass["Precision"]),
AUC = c(auc_glm_script_h, auc_dt_script_h, auc_script_h, auc_ann_script_h, auc_lasso_script_h)
)
knitr::kable(tab_h, digits=3, caption="Table 2. Performance of ML models for the prediction of low ALMI in males")
Table 2. Performance of ML models for the prediction of low
ALMI in males
| Logistic Regression |
0.765 |
0.75 |
0.769 |
0.50 |
0.769 |
| Decision Tree |
0.824 |
0.75 |
0.846 |
0.60 |
0.798 |
| Random Forest |
0.882 |
0.75 |
0.923 |
0.75 |
0.788 |
| ANN |
0.824 |
0.75 |
0.846 |
0.60 |
0.923 |
| LASSO |
0.824 |
0.75 |
0.846 |
0.60 |
0.865 |