# Instalar y cargar las librerías necesarias
paquetes <- c("readxl", "caret", "rpart", "rpart.plot", "randomForest",
"e1071", "xgboost", "pROC", "ggplot2", "ROCR", "dplyr")
instalar <- paquetes[!(paquetes %in% installed.packages()[,"Package"])]
if(length(instalar)) install.packages(instalar, dependencies = TRUE)
lapply(paquetes, library, character.only = TRUE)
## [[1]]
## [1] "readxl" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "caret" "lattice" "ggplot2" "readxl" "stats" "graphics"
## [7] "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "rpart" "caret" "lattice" "ggplot2" "readxl" "stats"
## [7] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "rpart.plot" "rpart" "caret" "lattice" "ggplot2"
## [6] "readxl" "stats" "graphics" "grDevices" "utils"
## [11] "datasets" "methods" "base"
##
## [[5]]
## [1] "randomForest" "rpart.plot" "rpart" "caret" "lattice"
## [6] "ggplot2" "readxl" "stats" "graphics" "grDevices"
## [11] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "e1071" "randomForest" "rpart.plot" "rpart" "caret"
## [6] "lattice" "ggplot2" "readxl" "stats" "graphics"
## [11] "grDevices" "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "xgboost" "e1071" "randomForest" "rpart.plot" "rpart"
## [6] "caret" "lattice" "ggplot2" "readxl" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[8]]
## [1] "pROC" "xgboost" "e1071" "randomForest" "rpart.plot"
## [6] "rpart" "caret" "lattice" "ggplot2" "readxl"
## [11] "stats" "graphics" "grDevices" "utils" "datasets"
## [16] "methods" "base"
##
## [[9]]
## [1] "pROC" "xgboost" "e1071" "randomForest" "rpart.plot"
## [6] "rpart" "caret" "lattice" "ggplot2" "readxl"
## [11] "stats" "graphics" "grDevices" "utils" "datasets"
## [16] "methods" "base"
##
## [[10]]
## [1] "ROCR" "pROC" "xgboost" "e1071" "randomForest"
## [6] "rpart.plot" "rpart" "caret" "lattice" "ggplot2"
## [11] "readxl" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[11]]
## [1] "dplyr" "ROCR" "pROC" "xgboost" "e1071"
## [6] "randomForest" "rpart.plot" "rpart" "caret" "lattice"
## [11] "ggplot2" "readxl" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
Carga de datos
datos <- read_excel("C:/Users/user/Downloads/credit_risk.xlsx")
datos$Default <- as.factor(datos$Default)
# Asegurar que ciertas variables sean numéricas
numericas <- c("Age", "Income", "Amount", "Rate", "Percent_income", "Cred_length")
datos[numericas] <- lapply(datos[numericas], as.numeric)
str(datos)
## tibble [32,581 × 12] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:32581] 0 1 2 3 4 5 6 7 8 9 ...
## $ Age : num [1:32581] 22 21 25 23 24 21 26 24 24 21 ...
## $ Income : num [1:32581] 59000 9600 9600 65500 54400 ...
## $ Home : chr [1:32581] "RENT" "OWN" "MORTGAGE" "RENT" ...
## $ Emp_length : num [1:32581] 123 5 1 4 8 2 8 5 8 6 ...
## $ Intent : chr [1:32581] "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
## $ Amount : num [1:32581] 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
## $ Rate : num [1:32581] 45704 11.1 12.9 15.2 14.3 ...
## $ Status : num [1:32581] 1 0 1 1 1 1 1 1 1 1 ...
## $ Percent_income: num [1:32581] 0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
## $ Default : Factor w/ 2 levels "N","Y": 2 1 1 1 2 1 1 1 1 1 ...
## $ Cred_length : num [1:32581] 3 2 3 2 4 2 3 4 2 3 ...
summary(datos)
## Id Age Income Home
## Min. : 0 Min. : 20.00 Min. : 4000 Length:32581
## 1st Qu.: 8145 1st Qu.: 23.00 1st Qu.: 38500 Class :character
## Median :16290 Median : 26.00 Median : 55000 Mode :character
## Mean :16290 Mean : 27.73 Mean : 66075
## 3rd Qu.:24435 3rd Qu.: 30.00 3rd Qu.: 79200
## Max. :32780 Max. :144.00 Max. :6000000
##
## Emp_length Intent Amount Rate
## Min. : 0.00 Length:32581 Min. : 500 Min. : 5.42
## 1st Qu.: 2.00 Class :character 1st Qu.: 5000 1st Qu.: 8.59
## Median : 4.00 Mode :character Median : 8000 Median : 11.66
## Mean : 4.79 Mean : 9589 Mean : 4906.73
## 3rd Qu.: 7.00 3rd Qu.:12200 3rd Qu.: 14.42
## Max. :123.00 Max. :35000 Max. :46005.00
## NA's :895 NA's :3116
## Status Percent_income Default Cred_length
## Min. :0.0000 Min. :0.0000 N:26836 Min. : 2.000
## 1st Qu.:0.0000 1st Qu.:0.0900 Y: 5745 1st Qu.: 3.000
## Median :0.0000 Median :0.1500 Median : 4.000
## Mean :0.2182 Mean :0.1702 Mean : 5.804
## 3rd Qu.:0.0000 3rd Qu.:0.2300 3rd Qu.: 8.000
## Max. :1.0000 Max. :0.8300 Max. :30.000
##
Partición en train y test
set.seed(123)
inTrain <- createDataPartition(y = datos$Default, p = 0.7, list = FALSE)
train <- datos[inTrain, ]
test <- datos[-inTrain, ]
Limpieza de datos con NA
train <- na.omit(train)
test <- na.omit(test)
Árbol de Decisión
modelo_arbol <- rpart(Default ~ ., data = train, method = "class")
rpart.plot(modelo_arbol)

pred_arbol <- predict(modelo_arbol, test, type = "class")
conf_matrix_arbol <- confusionMatrix(pred_arbol, test$Default)
prob_arbol <- predict(modelo_arbol, test, type = "prob")[,2]
roc_arbol <- roc(test$Default, prob_arbol)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc_arbol, main="ROC Árbol de Decisión")

Random Forest
modelo_rf <- randomForest(Default ~ ., data = train, ntree = 100)
pred_rf <- predict(modelo_rf, test)
conf_matrix_rf <- confusionMatrix(pred_rf, test$Default)
prob_rf <- predict(modelo_rf, test, type = "prob")[,2]
roc_rf <- roc(test$Default, prob_rf)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc_rf, main="ROC Random Forest")

SVM
modelo_svm <- svm(Default ~ ., data = train, kernel = "radial", probability = TRUE)
pred_svm <- predict(modelo_svm, test)
conf_matrix_svm <- confusionMatrix(pred_svm, test$Default)
prob_svm <- attr(predict(modelo_svm, test, probability = TRUE), "probabilities")[,2]
roc_svm <- roc(test$Default, prob_svm)
## Setting levels: control = N, case = Y
## Setting direction: controls > cases
plot(roc_svm, main="ROC SVM")

XGBoost
train_matrix <- model.matrix(Default ~ . -1, data = train)
test_matrix <- model.matrix(Default ~ . -1, data = test)
train_label <- ifelse(train$Default == "Yes", 1, 0)
test_label <- ifelse(test$Default == "Yes", 1, 0)
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)
modelo_xgb <- xgboost(data = dtrain, nrounds = 100, objective = "binary:logistic", verbose = 0)
prob_xgb <- predict(modelo_xgb, dtest)
pred_xgb <- ifelse(prob_xgb > 0.5, "Yes", "No")
conf_matrix_xgb <- confusionMatrix(factor(pred_xgb, levels = levels(test$Default)), test$Default)
roc_xgb <- roc(test$Default, prob_xgb)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc_xgb, main="ROC XGBoost")

Comparación de métricas
resultados <- data.frame(
Modelo = c("Árbol", "Random Forest", "SVM", "XGBoost"),
Accuracy = c(conf_matrix_arbol$overall["Accuracy"],
conf_matrix_rf$overall["Accuracy"],
conf_matrix_svm$overall["Accuracy"],
conf_matrix_xgb$overall["Accuracy"]),
Sensitivity = c(conf_matrix_arbol$byClass["Sensitivity"],
conf_matrix_rf$byClass["Sensitivity"],
conf_matrix_svm$byClass["Sensitivity"],
conf_matrix_xgb$byClass["Sensitivity"]),
Specificity = c(conf_matrix_arbol$byClass["Specificity"],
conf_matrix_rf$byClass["Specificity"],
conf_matrix_svm$byClass["Specificity"],
conf_matrix_xgb$byClass["Specificity"]),
F1 = c(conf_matrix_arbol$byClass["F1"],
conf_matrix_rf$byClass["F1"],
conf_matrix_svm$byClass["F1"],
conf_matrix_xgb$byClass["F1"])
)
print(resultados)
## Modelo Accuracy Sensitivity Specificity F1
## 1 Árbol 0.8219353 0.9337110 0.3088427 0.8959565
## 2 Random Forest 0.8208886 0.9191218 0.3699610 0.8939248
## 3 SVM 0.8211212 1.0000000 0.0000000 0.9017755
## 4 XGBoost NaN NA NA NA