library(rpart) # Para árboles de decisión básicoslibrary(rpart.plot) # Para visualización mejorada#install.packages(c("tree","randomForest","caret","dplyr"))library(tree) # Alternativa para árboles
Warning: package 'tree' was built under R version 4.5.1
library(randomForest) # Para random forest
Warning: package 'randomForest' was built under R version 4.5.1
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
library(caret) # Para validación cruzada y métricas
Warning: package 'caret' was built under R version 4.5.1
Cargando paquete requerido: ggplot2
Adjuntando el paquete: 'ggplot2'
The following object is masked from 'package:randomForest':
margin
Cargando paquete requerido: lattice
library(dplyr)
Warning: package 'dplyr' was built under R version 4.5.1
Adjuntando el paquete: 'dplyr'
The following object is masked from 'package:randomForest':
combine
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
set.seed(123)n <-1000credito_data <-data.frame(ingreso =rnorm(n, 50000, 20000),edad =sample(18:70, n, replace =TRUE),historial_credito =sample(c("Excelente", "Bueno", "Regular", "Malo"), n, replace =TRUE, prob =c(0.2, 0.3, 0.3, 0.2)), empleo_años =sample(0:30, n, replace =TRUE),deuda_actual =rnorm(n, 15000, 10000),educacion =sample(c("Secundaria", "Técnico", "Universitario", "Posgrado"), n,replace =TRUE, prob =c(0.3, 0.25, 0.35, 0.1)))# Crear variable dependiente (aprobación) basada en lógica económicacredito_data$ratio_deuda_ingreso <- credito_data$deuda_actual / credito_data$ingresocredito_data$aprobado <-with(credito_data, ifelse(ingreso >40000& historial_credito %in%c("Excelente", "Bueno") & ratio_deuda_ingreso <0.4& empleo_años >2, "Sí", "No"))# Introducir algo de variabilidadcredito_data$aprobado[sample(1:n, n*0.1)] <-ifelse(credito_data$aprobado[sample(1:n, n*0.1)] =="Sí", "No", "Sí")# Dividir en entrenamiento y pruebatrain_idx <-sample(1:n, 0.7*n)train_data <- credito_data[train_idx, ]test_data <- credito_data[-train_idx, ]# ====================================================================# MODELO 1: ÁRBOL DE CLASIFICACIÓN BÁSICO# ====================================================================# Crear el árbol de decisiónarbol_credito <-rpart(aprobado ~ ingreso + edad + historial_credito + empleo_años + deuda_actual + educacion + ratio_deuda_ingreso,data = train_data,method ="class",control =rpart.control(minsplit =20, cp =0.01))# Visualizar el árbolrpart.plot(arbol_credito, main ="Árbol de Decisión: Aprobación de Crédito",extra =104, # Mostrar probabilidades y conteosfallen.leaves =TRUE,cex =0.8)
# Resumen del modelosummary(arbol_credito)
Call:
rpart(formula = aprobado ~ ingreso + edad + historial_credito +
empleo_años + deuda_actual + educacion + ratio_deuda_ingreso,
data = train_data, method = "class", control = rpart.control(minsplit = 20,
cp = 0.01))
n= 700
CP nsplit rel error xerror xstd
1 0.2525 0 1.000 1.000 0.05976143
2 0.1650 2 0.495 0.515 0.04686264
3 0.1200 3 0.330 0.365 0.04043116
4 0.0100 4 0.210 0.235 0.03310751
Variable importance
ratio_deuda_ingreso ingreso historial_credito empleo_años
27 23 21 15
deuda_actual edad
13 1
Node number 1: 700 observations, complexity param=0.2525
predicted class=No expected loss=0.2857143 P(node) =1
class counts: 500 200
probabilities: 0.714 0.286
left son=2 (339 obs) right son=3 (361 obs)
Primary splits:
historial_credito splits as RRLL, improve=59.069190, (0 missing)
ratio_deuda_ingreso < 0.400619 to the right, improve=40.057490, (0 missing)
ingreso < 42122.85 to the left, improve=32.905290, (0 missing)
deuda_actual < 18179.06 to the right, improve=13.319490, (0 missing)
empleo_años < 3.5 to the left, improve= 6.643706, (0 missing)
Surrogate splits:
empleo_años < 24.5 to the right, agree=0.550, adj=0.071, (0 split)
edad < 24.5 to the left, agree=0.531, adj=0.032, (0 split)
ratio_deuda_ingreso < 0.552502 to the left, agree=0.530, adj=0.029, (0 split)
deuda_actual < 12831.26 to the left, agree=0.529, adj=0.027, (0 split)
educacion splits as LLRR, agree=0.523, adj=0.015, (0 split)
Node number 2: 339 observations
predicted class=No expected loss=0.07374631 P(node) =0.4842857
class counts: 314 25
probabilities: 0.926 0.074
Node number 3: 361 observations, complexity param=0.2525
predicted class=No expected loss=0.4847645 P(node) =0.5157143
class counts: 186 175
probabilities: 0.515 0.485
left son=6 (132 obs) right son=7 (229 obs)
Primary splits:
ratio_deuda_ingreso < 0.3881575 to the right, improve=69.62049, (0 missing)
ingreso < 40067.7 to the left, improve=61.06584, (0 missing)
deuda_actual < 18177.32 to the right, improve=21.59782, (0 missing)
empleo_años < 2.5 to the left, improve=19.34073, (0 missing)
edad < 46.5 to the left, improve= 4.03979, (0 missing)
Surrogate splits:
deuda_actual < 17779.45 to the right, agree=0.814, adj=0.492, (0 split)
ingreso < 38067.43 to the left, agree=0.773, adj=0.379, (0 split)
Node number 6: 132 observations
predicted class=No expected loss=0.07575758 P(node) =0.1885714
class counts: 122 10
probabilities: 0.924 0.076
Node number 7: 229 observations, complexity param=0.165
predicted class=Sí expected loss=0.279476 P(node) =0.3271429
class counts: 64 165
probabilities: 0.279 0.721
left son=14 (33 obs) right son=15 (196 obs)
Primary splits:
ingreso < 40067.7 to the left, improve=40.033200, (0 missing)
empleo_años < 2.5 to the left, improve=34.964930, (0 missing)
ratio_deuda_ingreso < -0.1726689 to the left, improve= 5.879789, (0 missing)
edad < 29.5 to the left, improve= 5.766968, (0 missing)
deuda_actual < 14656.82 to the left, improve= 2.680308, (0 missing)
Surrogate splits:
ratio_deuda_ingreso < -0.1924255 to the left, agree=0.873, adj=0.121, (0 split)
Node number 14: 33 observations
predicted class=No expected loss=0 P(node) =0.04714286
class counts: 33 0
probabilities: 1.000 0.000
Node number 15: 196 observations, complexity param=0.12
predicted class=Sí expected loss=0.1581633 P(node) =0.28
class counts: 31 165
probabilities: 0.158 0.842
left son=30 (28 obs) right son=31 (168 obs)
Primary splits:
empleo_años < 2.5 to the left, improve=38.7772100, (0 missing)
edad < 29.5 to the left, improve= 2.6184250, (0 missing)
ingreso < 54326.92 to the right, improve= 1.2175520, (0 missing)
ratio_deuda_ingreso < 0.04125637 to the right, improve= 0.9188392, (0 missing)
deuda_actual < 2175.068 to the right, improve= 0.8590359, (0 missing)
Surrogate splits:
edad < 18.5 to the left, agree=0.862, adj=0.036, (0 split)
Node number 30: 28 observations
predicted class=No expected loss=0.07142857 P(node) =0.04
class counts: 26 2
probabilities: 0.929 0.071
Node number 31: 168 observations
predicted class=Sí expected loss=0.0297619 P(node) =0.24
class counts: 5 163
probabilities: 0.030 0.970
# Importancia de variablesprint("Importancia de Variables:")
# Prediccionespredicciones <-predict(arbol_credito, test_data, type ="class")matriz_confusion <-table(test_data$aprobado, predicciones)print("Matriz de Confusión:")
[1] "Matriz de Confusión:"
print(matriz_confusion)
predicciones
No Sí
No 205 4
Sí 21 70
# Precisiónprecision <-sum(diag(matriz_confusion)) /sum(matriz_confusion)print(paste("Precisión del modelo:", round(precision, 3)))