ARBOL DE DECISION

library(rpart)        # Para árboles de decisión básicos
library(rpart.plot)   # Para visualización mejorada
#install.packages(c("tree","randomForest","caret","dplyr"))
library(tree)         # Alternativa para árboles
Warning: package 'tree' was built under R version 4.5.1
library(randomForest) # Para random forest
Warning: package 'randomForest' was built under R version 4.5.1
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
library(caret)        # Para validación cruzada y métricas
Warning: package 'caret' was built under R version 4.5.1
Cargando paquete requerido: ggplot2

Adjuntando el paquete: 'ggplot2'
The following object is masked from 'package:randomForest':

    margin
Cargando paquete requerido: lattice
library(dplyr) 
Warning: package 'dplyr' was built under R version 4.5.1

Adjuntando el paquete: 'dplyr'
The following object is masked from 'package:randomForest':

    combine
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
set.seed(123)
n <- 1000

credito_data <- data.frame(
  ingreso = rnorm(n, 50000, 20000),
  edad = sample(18:70, n, replace = TRUE),
  historial_credito = sample(c("Excelente", "Bueno", "Regular", "Malo"), n, 
                             replace = TRUE, prob = c(0.2, 0.3, 0.3, 0.2)),
  empleo_años = sample(0:30, n, replace = TRUE),
  deuda_actual = rnorm(n, 15000, 10000),
  educacion = sample(c("Secundaria", "Técnico", "Universitario", "Posgrado"), n,
                     replace = TRUE, prob = c(0.3, 0.25, 0.35, 0.1))
)

# Crear variable dependiente (aprobación) basada en lógica económica
credito_data$ratio_deuda_ingreso <- credito_data$deuda_actual / credito_data$ingreso
credito_data$aprobado <- with(credito_data, 
                              ifelse(ingreso > 40000 & 
                                       historial_credito %in% c("Excelente", "Bueno") &
                                       ratio_deuda_ingreso < 0.4 &
                                       empleo_años > 2, "Sí", "No"))

# Introducir algo de variabilidad
credito_data$aprobado[sample(1:n, n*0.1)] <- 
  ifelse(credito_data$aprobado[sample(1:n, n*0.1)] == "Sí", "No", "Sí")

# Dividir en entrenamiento y prueba
train_idx <- sample(1:n, 0.7*n)
train_data <- credito_data[train_idx, ]
test_data <- credito_data[-train_idx, ]

# ====================================================================
# MODELO 1: ÁRBOL DE CLASIFICACIÓN BÁSICO
# ====================================================================

# Crear el árbol de decisión
arbol_credito <- rpart(aprobado ~ ingreso + edad + historial_credito + 
                         empleo_años + deuda_actual + educacion + ratio_deuda_ingreso,
                       data = train_data,
                       method = "class",
                       control = rpart.control(minsplit = 20, cp = 0.01))

# Visualizar el árbol
rpart.plot(arbol_credito, 
           main = "Árbol de Decisión: Aprobación de Crédito",
           extra = 104,  # Mostrar probabilidades y conteos
           fallen.leaves = TRUE,
           cex = 0.8)

# Resumen del modelo
summary(arbol_credito)
Call:
rpart(formula = aprobado ~ ingreso + edad + historial_credito + 
    empleo_años + deuda_actual + educacion + ratio_deuda_ingreso, 
    data = train_data, method = "class", control = rpart.control(minsplit = 20, 
        cp = 0.01))
  n= 700 

      CP nsplit rel error xerror       xstd
1 0.2525      0     1.000  1.000 0.05976143
2 0.1650      2     0.495  0.515 0.04686264
3 0.1200      3     0.330  0.365 0.04043116
4 0.0100      4     0.210  0.235 0.03310751

Variable importance
ratio_deuda_ingreso             ingreso   historial_credito         empleo_años 
                 27                  23                  21                  15 
       deuda_actual                edad 
                 13                   1 

Node number 1: 700 observations,    complexity param=0.2525
  predicted class=No  expected loss=0.2857143  P(node) =1
    class counts:   500   200
   probabilities: 0.714 0.286 
  left son=2 (339 obs) right son=3 (361 obs)
  Primary splits:
      historial_credito   splits as  RRLL, improve=59.069190, (0 missing)
      ratio_deuda_ingreso < 0.400619   to the right, improve=40.057490, (0 missing)
      ingreso             < 42122.85   to the left,  improve=32.905290, (0 missing)
      deuda_actual        < 18179.06   to the right, improve=13.319490, (0 missing)
      empleo_años         < 3.5        to the left,  improve= 6.643706, (0 missing)
  Surrogate splits:
      empleo_años         < 24.5       to the right, agree=0.550, adj=0.071, (0 split)
      edad                < 24.5       to the left,  agree=0.531, adj=0.032, (0 split)
      ratio_deuda_ingreso < 0.552502   to the left,  agree=0.530, adj=0.029, (0 split)
      deuda_actual        < 12831.26   to the left,  agree=0.529, adj=0.027, (0 split)
      educacion           splits as  LLRR, agree=0.523, adj=0.015, (0 split)

Node number 2: 339 observations
  predicted class=No  expected loss=0.07374631  P(node) =0.4842857
    class counts:   314    25
   probabilities: 0.926 0.074 

Node number 3: 361 observations,    complexity param=0.2525
  predicted class=No  expected loss=0.4847645  P(node) =0.5157143
    class counts:   186   175
   probabilities: 0.515 0.485 
  left son=6 (132 obs) right son=7 (229 obs)
  Primary splits:
      ratio_deuda_ingreso < 0.3881575  to the right, improve=69.62049, (0 missing)
      ingreso             < 40067.7    to the left,  improve=61.06584, (0 missing)
      deuda_actual        < 18177.32   to the right, improve=21.59782, (0 missing)
      empleo_años         < 2.5        to the left,  improve=19.34073, (0 missing)
      edad                < 46.5       to the left,  improve= 4.03979, (0 missing)
  Surrogate splits:
      deuda_actual < 17779.45   to the right, agree=0.814, adj=0.492, (0 split)
      ingreso      < 38067.43   to the left,  agree=0.773, adj=0.379, (0 split)

Node number 6: 132 observations
  predicted class=No  expected loss=0.07575758  P(node) =0.1885714
    class counts:   122    10
   probabilities: 0.924 0.076 

Node number 7: 229 observations,    complexity param=0.165
  predicted class=Sí  expected loss=0.279476  P(node) =0.3271429
    class counts:    64   165
   probabilities: 0.279 0.721 
  left son=14 (33 obs) right son=15 (196 obs)
  Primary splits:
      ingreso             < 40067.7    to the left,  improve=40.033200, (0 missing)
      empleo_años         < 2.5        to the left,  improve=34.964930, (0 missing)
      ratio_deuda_ingreso < -0.1726689 to the left,  improve= 5.879789, (0 missing)
      edad                < 29.5       to the left,  improve= 5.766968, (0 missing)
      deuda_actual        < 14656.82   to the left,  improve= 2.680308, (0 missing)
  Surrogate splits:
      ratio_deuda_ingreso < -0.1924255 to the left,  agree=0.873, adj=0.121, (0 split)

Node number 14: 33 observations
  predicted class=No  expected loss=0  P(node) =0.04714286
    class counts:    33     0
   probabilities: 1.000 0.000 

Node number 15: 196 observations,    complexity param=0.12
  predicted class=Sí  expected loss=0.1581633  P(node) =0.28
    class counts:    31   165
   probabilities: 0.158 0.842 
  left son=30 (28 obs) right son=31 (168 obs)
  Primary splits:
      empleo_años         < 2.5        to the left,  improve=38.7772100, (0 missing)
      edad                < 29.5       to the left,  improve= 2.6184250, (0 missing)
      ingreso             < 54326.92   to the right, improve= 1.2175520, (0 missing)
      ratio_deuda_ingreso < 0.04125637 to the right, improve= 0.9188392, (0 missing)
      deuda_actual        < 2175.068   to the right, improve= 0.8590359, (0 missing)
  Surrogate splits:
      edad < 18.5       to the left,  agree=0.862, adj=0.036, (0 split)

Node number 30: 28 observations
  predicted class=No  expected loss=0.07142857  P(node) =0.04
    class counts:    26     2
   probabilities: 0.929 0.071 

Node number 31: 168 observations
  predicted class=Sí  expected loss=0.0297619  P(node) =0.24
    class counts:     5   163
   probabilities: 0.030 0.970 
# Importancia de variables
print("Importancia de Variables:")
[1] "Importancia de Variables:"
print(arbol_credito$variable.importance)
ratio_deuda_ingreso             ingreso   historial_credito         empleo_años 
          76.215450           66.404593           59.069191           42.959101 
       deuda_actual                edad           educacion 
          35.851024            3.301600            0.871227 
# Predicciones
predicciones <- predict(arbol_credito, test_data, type = "class")
matriz_confusion <- table(test_data$aprobado, predicciones)
print("Matriz de Confusión:")
[1] "Matriz de Confusión:"
print(matriz_confusion)
    predicciones
      No  Sí
  No 205   4
  Sí  21  70
# Precisión
precision <- sum(diag(matriz_confusion)) / sum(matriz_confusion)
print(paste("Precisión del modelo:", round(precision, 3)))
[1] "Precisión del modelo: 0.917"