# Instalar y cargar las librerías necesarias
paquetes <- c("readxl", "caret", "rpart", "rpart.plot", "randomForest",
              "e1071", "xgboost", "pROC", "ggplot2", "ROCR", "dplyr")

instalar <- paquetes[!(paquetes %in% installed.packages()[,"Package"])]
if(length(instalar)) install.packages(instalar, dependencies = TRUE)

lapply(paquetes, library, character.only = TRUE)
## [[1]]
## [1] "readxl"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
##  [1] "caret"     "lattice"   "ggplot2"   "readxl"    "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "rpart"     "caret"     "lattice"   "ggplot2"   "readxl"    "stats"    
##  [7] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "rpart.plot" "rpart"      "caret"      "lattice"    "ggplot2"   
##  [6] "readxl"     "stats"      "graphics"   "grDevices"  "utils"     
## [11] "datasets"   "methods"    "base"      
## 
## [[5]]
##  [1] "randomForest" "rpart.plot"   "rpart"        "caret"        "lattice"     
##  [6] "ggplot2"      "readxl"       "stats"        "graphics"     "grDevices"   
## [11] "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "e1071"        "randomForest" "rpart.plot"   "rpart"        "caret"       
##  [6] "lattice"      "ggplot2"      "readxl"       "stats"        "graphics"    
## [11] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "xgboost"      "e1071"        "randomForest" "rpart.plot"   "rpart"       
##  [6] "caret"        "lattice"      "ggplot2"      "readxl"       "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[8]]
##  [1] "pROC"         "xgboost"      "e1071"        "randomForest" "rpart.plot"  
##  [6] "rpart"        "caret"        "lattice"      "ggplot2"      "readxl"      
## [11] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [16] "methods"      "base"        
## 
## [[9]]
##  [1] "pROC"         "xgboost"      "e1071"        "randomForest" "rpart.plot"  
##  [6] "rpart"        "caret"        "lattice"      "ggplot2"      "readxl"      
## [11] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [16] "methods"      "base"        
## 
## [[10]]
##  [1] "ROCR"         "pROC"         "xgboost"      "e1071"        "randomForest"
##  [6] "rpart.plot"   "rpart"        "caret"        "lattice"      "ggplot2"     
## [11] "readxl"       "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "dplyr"        "ROCR"         "pROC"         "xgboost"      "e1071"       
##  [6] "randomForest" "rpart.plot"   "rpart"        "caret"        "lattice"     
## [11] "ggplot2"      "readxl"       "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"

Carga de datos

datos <- read_excel("C:/Users/user/Downloads/credit_risk.xlsx")
datos$Default <- as.factor(datos$Default)

# Asegurar que ciertas variables sean numéricas
numericas <- c("Age", "Income", "Amount", "Rate", "Percent_income", "Cred_length")
datos[numericas] <- lapply(datos[numericas], as.numeric)

str(datos)
## tibble [32,581 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Id            : num [1:32581] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Age           : num [1:32581] 22 21 25 23 24 21 26 24 24 21 ...
##  $ Income        : num [1:32581] 59000 9600 9600 65500 54400 ...
##  $ Home          : chr [1:32581] "RENT" "OWN" "MORTGAGE" "RENT" ...
##  $ Emp_length    : num [1:32581] 123 5 1 4 8 2 8 5 8 6 ...
##  $ Intent        : chr [1:32581] "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
##  $ Amount        : num [1:32581] 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
##  $ Rate          : num [1:32581] 45704 11.1 12.9 15.2 14.3 ...
##  $ Status        : num [1:32581] 1 0 1 1 1 1 1 1 1 1 ...
##  $ Percent_income: num [1:32581] 0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
##  $ Default       : Factor w/ 2 levels "N","Y": 2 1 1 1 2 1 1 1 1 1 ...
##  $ Cred_length   : num [1:32581] 3 2 3 2 4 2 3 4 2 3 ...
summary(datos)
##        Id             Age             Income            Home          
##  Min.   :    0   Min.   : 20.00   Min.   :   4000   Length:32581      
##  1st Qu.: 8145   1st Qu.: 23.00   1st Qu.:  38500   Class :character  
##  Median :16290   Median : 26.00   Median :  55000   Mode  :character  
##  Mean   :16290   Mean   : 27.73   Mean   :  66075                     
##  3rd Qu.:24435   3rd Qu.: 30.00   3rd Qu.:  79200                     
##  Max.   :32780   Max.   :144.00   Max.   :6000000                     
##                                                                       
##    Emp_length        Intent              Amount           Rate         
##  Min.   :  0.00   Length:32581       Min.   :  500   Min.   :    5.42  
##  1st Qu.:  2.00   Class :character   1st Qu.: 5000   1st Qu.:    8.59  
##  Median :  4.00   Mode  :character   Median : 8000   Median :   11.66  
##  Mean   :  4.79                      Mean   : 9589   Mean   : 4906.73  
##  3rd Qu.:  7.00                      3rd Qu.:12200   3rd Qu.:   14.42  
##  Max.   :123.00                      Max.   :35000   Max.   :46005.00  
##  NA's   :895                                         NA's   :3116      
##      Status       Percent_income   Default    Cred_length    
##  Min.   :0.0000   Min.   :0.0000   N:26836   Min.   : 2.000  
##  1st Qu.:0.0000   1st Qu.:0.0900   Y: 5745   1st Qu.: 3.000  
##  Median :0.0000   Median :0.1500             Median : 4.000  
##  Mean   :0.2182   Mean   :0.1702             Mean   : 5.804  
##  3rd Qu.:0.0000   3rd Qu.:0.2300             3rd Qu.: 8.000  
##  Max.   :1.0000   Max.   :0.8300             Max.   :30.000  
## 

Partición en train y test

set.seed(123)
inTrain <- createDataPartition(y = datos$Default, p = 0.7, list = FALSE)
train <- datos[inTrain, ]
test <- datos[-inTrain, ]

Limpieza de datos con NA

train <- na.omit(train)
test <- na.omit(test)

Árbol de Decisión

modelo_arbol <- rpart(Default ~ ., data = train, method = "class")
rpart.plot(modelo_arbol)

pred_arbol <- predict(modelo_arbol, test, type = "class")
conf_matrix_arbol <- confusionMatrix(pred_arbol, test$Default)
prob_arbol <- predict(modelo_arbol, test, type = "prob")[,2]
roc_arbol <- roc(test$Default, prob_arbol)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc_arbol, main="ROC Árbol de Decisión")

Random Forest

modelo_rf <- randomForest(Default ~ ., data = train, ntree = 100)
pred_rf <- predict(modelo_rf, test)
conf_matrix_rf <- confusionMatrix(pred_rf, test$Default)
prob_rf <- predict(modelo_rf, test, type = "prob")[,2]
roc_rf <- roc(test$Default, prob_rf)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc_rf, main="ROC Random Forest")

SVM

modelo_svm <- svm(Default ~ ., data = train, kernel = "radial", probability = TRUE)
pred_svm <- predict(modelo_svm, test)
conf_matrix_svm <- confusionMatrix(pred_svm, test$Default)
prob_svm <- attr(predict(modelo_svm, test, probability = TRUE), "probabilities")[,2]
roc_svm <- roc(test$Default, prob_svm)
## Setting levels: control = N, case = Y
## Setting direction: controls > cases
plot(roc_svm, main="ROC SVM")

XGBoost

train_matrix <- model.matrix(Default ~ . -1, data = train)
test_matrix <- model.matrix(Default ~ . -1, data = test)
train_label <- ifelse(train$Default == "Yes", 1, 0)
test_label <- ifelse(test$Default == "Yes", 1, 0)

dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)

modelo_xgb <- xgboost(data = dtrain, nrounds = 100, objective = "binary:logistic", verbose = 0)
prob_xgb <- predict(modelo_xgb, dtest)
pred_xgb <- ifelse(prob_xgb > 0.5, "Yes", "No")
conf_matrix_xgb <- confusionMatrix(factor(pred_xgb, levels = levels(test$Default)), test$Default)
roc_xgb <- roc(test$Default, prob_xgb)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc_xgb, main="ROC XGBoost")

Comparación de métricas

resultados <- data.frame(
  Modelo = c("Árbol", "Random Forest", "SVM", "XGBoost"),
  Accuracy = c(conf_matrix_arbol$overall["Accuracy"],
               conf_matrix_rf$overall["Accuracy"],
               conf_matrix_svm$overall["Accuracy"],
               conf_matrix_xgb$overall["Accuracy"]),
  Sensitivity = c(conf_matrix_arbol$byClass["Sensitivity"],
                  conf_matrix_rf$byClass["Sensitivity"],
                  conf_matrix_svm$byClass["Sensitivity"],
                  conf_matrix_xgb$byClass["Sensitivity"]),
  Specificity = c(conf_matrix_arbol$byClass["Specificity"],
                  conf_matrix_rf$byClass["Specificity"],
                  conf_matrix_svm$byClass["Specificity"],
                  conf_matrix_xgb$byClass["Specificity"]),
  F1 = c(conf_matrix_arbol$byClass["F1"],
         conf_matrix_rf$byClass["F1"],
         conf_matrix_svm$byClass["F1"],
         conf_matrix_xgb$byClass["F1"])
)
print(resultados)
##          Modelo  Accuracy Sensitivity Specificity        F1
## 1         Árbol 0.8219353   0.9337110   0.3088427 0.8959565
## 2 Random Forest 0.8208886   0.9191218   0.3699610 0.8939248
## 3           SVM 0.8211212   1.0000000   0.0000000 0.9017755
## 4       XGBoost       NaN          NA          NA        NA