MODELADO

# ═══════════════════════════════════════════════════════════════════════════════
#  PASO 4 — MODELADO PREDICTIVO  |  BASE I2C — CARTERA
#  Proyecto  : Analítica Predictiva — Metodología CRISP-DM
#  Archivo   : Base_Modelado_I2C_Lista_1.xlsx
#              141.669 registros · 33 variables · Solo facturas RV
#
#  Distribución del target:
#    · Pago_Oportuno_Bin = 1 (oportuno) : 138.919  (98,1%)
#    · Pago_Oportuno_Bin = 0 (mora)     :   2.742  ( 1,9%)
#    → Desbalance severo: estrategia ROSE + SMOTE obligatoria
#
#  Variables objetivo disponibles:
#    · Pago_Oportuno_Bin  → Clasificación binaria   (0 = mora, 1 = oportuno)
#    · Arrears_calc       → Regresión continua       (días de atraso)
#    · Bucket_Mora_calc   → Clasificación multiclase (tramo de mora)
#
#  ESTRUCTURA DEL SCRIPT
#  ─────────────────────
#  SEC  0  ▸ Librerías
#  SEC  1  ▸ Carga y validación de datos
#  SEC  2  ▸ Análisis exploratorio (EDA confirmatorio)
#  SEC  3  ▸ Preparación del dataset de modelado
#  SEC  4  ▸ División train / test
#  SEC  5  ▸ MODELO 1 — Regresión lineal (Arrears_calc)
#  SEC  6  ▸ MODELO 2 — Regresión logística (Pago_Oportuno_Bin)
#  SEC  7  ▸ Balanceo de clases (ROSE y SMOTE)
#  SEC  8  ▸ MODELO 3 — Regresión logística con datos balanceados
#  SEC  9  ▸ MODELO 4 — k-NN con datos balanceados
#  SEC 10  ▸ Comparación y selección de modelos
#  SEC 11  ▸ Gráficos consolidados
#  SEC 12  ▸ Exportación de resultados
# ═══════════════════════════════════════════════════════════════════════════════


# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 0 ▸ LIBRERÍAS
# ───────────────────────────────────────────────────────────────────────────────

paquetes <- c(
  "readxl",        # Carga del Excel
  "tidyverse",     # Manipulación de datos y ggplot2
  "janitor",       # Limpieza de nombres
  "psych",         # describe() — estadísticas descriptivas
  "corrplot",      # Visualización de correlaciones
  "lmtest",        # dwtest() — diagnóstico de regresión
  "leaps",         # regsubsets() — selección de variables
  "ROSE",          # ovun.sample() — balanceo ROSE
  "smotefamily",   # SMOTE() — síntesis de datos
  "class",         # knn() — k vecinos más cercanos
  "caret",         # confusionMatrix() — evaluación de modelos
  "patchwork",     # Paneles de gráficos
  "scales"         # Formateo de ejes
)

for (pkg in paquetes) {
  if (!requireNamespace(pkg, quietly = TRUE))
    install.packages(pkg, repos = "https://cran.r-project.org")
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}

## Warning: package 'readxl' was built under R version 4.5.3

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'dplyr' was built under R version 4.5.3

## Warning: package 'stringr' was built under R version 4.5.3

## Warning: package 'forcats' was built under R version 4.5.3

## Warning: package 'janitor' was built under R version 4.5.3

## Warning: package 'psych' was built under R version 4.5.3

## Warning: package 'smotefamily' was built under R version 4.5.3

## Warning: package 'patchwork' was built under R version 4.5.3

## Warning: package 'scales' was built under R version 4.5.3

cat("✓ Librerías cargadas\n\n")

## ✓ Librerías cargadas

# ── Paleta y tema visual ──────────────────────────────────────────────────────
C1 <- "#1B3A6B"  # azul — principal
C2 <- "#E84040"  # rojo  — mora / alerta
C3 <- "#F5A623"  # naranja — advertencia
C4 <- "#27AE60"  # verde — oportuno / positivo
C5 <- "#8E44AD"  # morado — acento

PALETA_BUCKET <- c(
  "Al dia"  = C4, "1-30"    = "#85C17E",
  "31-60"   = C3, "61-90"   = "#E8A323",
  "91-180"  = "#D4691E", "181-360" = C2, ">360" = "#7B0000"
)

tema_base <- theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13, color = C1),
    plot.subtitle = element_text(size = 10, color = "gray40"),
    plot.caption  = element_text(size = 8, color = "gray55"),
    axis.title    = element_text(size = 10),
    legend.position  = "bottom",
    panel.grid.minor = element_blank()
  )

set.seed(42)  # Semilla global para reproducibilidad


# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 1 ▸ CARGA Y VALIDACIÓN DE DATOS
# ───────────────────────────────────────────────────────────────────────────────

cat("═══════════════════════════════════════════════════════\n")

## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 1 — CARGA Y VALIDACIÓN\n")

##   SECCIÓN 1 — CARGA Y VALIDACIÓN

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# ── Ajusta la ruta si el archivo está en otro directorio ─────────────────────
RUTA <- "C:/Users/jcabia01/Downloads/Base_Modelado_I2C_Lista_1.xlsx"

cat("Cargando archivo...\n")

## Cargando archivo...

df_raw <- read_excel(RUTA, sheet = "Modelado_Final", guess_max = 5000)
cat("✓ Cargado:", formatC(nrow(df_raw), big.mark = ","),
    "filas ×", ncol(df_raw), "columnas\n\n")

## ✓ Cargado: 141,669 filas × 33 columnas

# Vista estructural
str(df_raw)

## tibble [141,669 × 33] (S3: tbl_df/tbl/data.frame)
##  $ Anon_Customer_ID          : chr [1:141669] "CUST_139" "CUST_139" "CUST_158" "CUST_158" ...
##  $ Anon_Document_ID          : chr [1:141669] "XXXXXX7262" "XXXXXX7465" "XXXXXX2943" "XXXXXX4030" ...
##  $ Terms_of_Payment          : chr [1:141669] "Z540" "Z540" "Z914" "Z914" ...
##  $ Document_Type             : chr [1:141669] "RV" "RV" "RV" "RV" ...
##  $ Document_Date             : POSIXct[1:141669], format: "2026-03-20" "2026-03-24" ...
##  $ Payment_date              : POSIXct[1:141669], format: "2026-05-24" "2026-05-25" ...
##  $ Net_due_date              : POSIXct[1:141669], format: "2026-05-24" "2026-05-25" ...
##  $ Clearing_date             : POSIXct[1:141669], format: NA NA ...
##  $ Arrears_after_net_due_date: num [1:141669] -56 -57 -19 -26 -32 -32 -44 -20 -21 -21 ...
##  $ Amount_in_local_currency  : num [1:141669] 3.73e+07 1.83e+07 1.00e+08 1.29e+08 3.32e+07 ...
##  $ Reason_code               : chr [1:141669] "Sin_Codigo" "Sin_Codigo" "Sin_Codigo" "Sin_Codigo" ...
##  $ Year/month                : chr [1:141669] "2026/03" "2026/03" "2026/03" "2026/03" ...
##  $ Estado_Cartera            : chr [1:141669] "Abierta" "Abierta" "Abierta" "Abierta" ...
##  $ Arrears_calc              : num [1:141669] -44 -45 -7 -14 -20 -20 -32 -8 -9 -9 ...
##  $ Pago_Oportuno_Bin         : num [1:141669] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Bucket_Mora_calc          : chr [1:141669] "Al dia" "Al dia" "Al dia" "Al dia" ...
##  $ Prom_Mora_Cliente         : num [1:141669] -0.43 -0.43 -15.17 -15.17 -15.17 ...
##  $ Max_Mora_Cliente          : num [1:141669] 0 0 -7 -7 -7 -7 -7 -8 -8 -8 ...
##  $ Min_Mora_Cliente          : num [1:141669] -45 -45 -32 -32 -32 -32 -32 -31 -31 -31 ...
##  $ Desv_Mora_Cliente         : num [1:141669] 4.35 4.35 1.95 1.95 1.95 1.95 1.95 1.86 1.86 1.86 ...
##  $ Freq_Mora_Cliente         : num [1:141669] 0 0 0 0 0 0 0 0 0 0 ...
##  $ N_Facturas_Cliente        : num [1:141669] 416 416 212 212 212 ...
##  $ Mes_Documento             : num [1:141669] 3 3 2 3 3 3 3 3 3 3 ...
##  $ Trimestre_Documento       : num [1:141669] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Anio_Documento            : num [1:141669] 2026 2026 2026 2026 2026 ...
##  $ Dias_Hasta_Vencimiento    : num [1:141669] 65 62 49 48 47 47 47 47 48 48 ...
##  $ Antiguedad_Documento      : num [1:141669] 21 17 42 34 27 27 15 39 39 39 ...
##  $ Monto_Absoluto            : num [1:141669] 3.73e+07 1.83e+07 1.00e+08 1.29e+08 3.32e+07 ...
##  $ Log_Monto                 : num [1:141669] 17.4 16.7 18.4 18.7 17.3 ...
##  $ Terms_Grupo               : chr [1:141669] "Z540" "Z540" "Z914" "Z914" ...
##  $ Estado_Num                : num [1:141669] 1 1 1 1 1 1 1 1 1 1 ...
##  $ flag_fecha_inconsistente  : num [1:141669] 0 0 0 0 0 0 0 0 0 0 ...
##  $ flag_amount_negativo      : num [1:141669] 0 0 0 0 0 0 0 0 0 0 ...

cat("\n")

print(summary(df_raw))

##  Anon_Customer_ID   Anon_Document_ID   Terms_of_Payment   Document_Type     
##  Length:141669      Length:141669      Length:141669      Length:141669     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Document_Date                  Payment_date                
##  Min.   :2022-01-03 00:00:00   Min.   :2022-01-18 00:00:00  
##  1st Qu.:2022-12-19 00:00:00   1st Qu.:2023-01-27 00:00:00  
##  Median :2024-01-25 00:00:00   Median :2024-03-02 00:00:00  
##  Mean   :2024-01-30 13:02:56   Mean   :2024-03-08 16:52:00  
##  3rd Qu.:2025-03-10 00:00:00   3rd Qu.:2025-04-17 00:00:00  
##  Max.   :2026-03-26 00:00:00   Max.   :2026-06-25 00:00:00  
##                                                             
##   Net_due_date                 Clearing_date                
##  Min.   :2022-01-18 00:00:00   Min.   :2022-01-04 00:00:00  
##  1st Qu.:2023-02-03 00:00:00   1st Qu.:2023-01-20 00:00:00  
##  Median :2024-03-09 00:00:00   Median :2024-01-25 00:00:00  
##  Mean   :2024-03-15 08:56:22   Mean   :2024-02-06 15:38:05  
##  3rd Qu.:2025-04-28 00:00:00   3rd Qu.:2025-02-17 00:00:00  
##  Max.   :2026-06-25 00:00:00   Max.   :2026-03-30 00:00:00  
##                                NA's   :7320                 
##  Arrears_after_net_due_date Amount_in_local_currency Reason_code       
##  Min.   :-382.000           Min.   : -80962384       Length:141669     
##  1st Qu.: -10.000           1st Qu.:   1514037       Class :character  
##  Median :   4.000           Median :   6131832       Mode  :character  
##  Mean   :   2.728           Mean   :  23118025                         
##  3rd Qu.:  14.000           3rd Qu.:  20395610                         
##  Max.   : 577.000           Max.   :1611343446                         
##                                                                        
##   Year/month        Estado_Cartera      Arrears_calc     Pago_Oportuno_Bin
##  Length:141669      Length:141669      Min.   :-76.000   Min.   :0.0000   
##  Class :character   Class :character   1st Qu.:-15.000   1st Qu.:1.0000   
##  Mode  :character   Mode  :character   Median :  0.000   Median :1.0000   
##                                        Mean   : -6.478   Mean   :0.9806   
##                                        3rd Qu.:  0.000   3rd Qu.:1.0000   
##                                        Max.   :586.000   Max.   :1.0000   
##                                        NA's   :8         NA's   :8        
##  Bucket_Mora_calc   Prom_Mora_Cliente Max_Mora_Cliente Min_Mora_Cliente
##  Length:141669      Min.   :-23.000   Min.   :-15.00   Min.   :-76.00  
##  Class :character   1st Qu.:-14.970   1st Qu.:  0.00   1st Qu.:-32.00  
##  Mode  :character   Median : -0.630   Median : 26.00   Median :-28.00  
##                     Mean   : -6.479   Mean   : 58.51   Mean   :-25.31  
##                     3rd Qu.:  0.150   3rd Qu.:134.00   3rd Qu.:-21.00  
##                     Max.   : 19.650   Max.   :586.00   Max.   :  0.00  
##                                                                        
##  Desv_Mora_Cliente Freq_Mora_Cliente N_Facturas_Cliente Mes_Documento   
##  Min.   :  0.000   Min.   :0.00000   Min.   :    1      Min.   : 1.000  
##  1st Qu.:  2.110   1st Qu.:0.00000   1st Qu.:  941      1st Qu.: 3.000  
##  Median :  2.260   Median :0.01710   Median : 4044      Median : 6.000  
##  Mean   :  4.007   Mean   :0.01936   Mean   :16159      Mean   : 5.951  
##  3rd Qu.:  3.480   3rd Qu.:0.03480   3rd Qu.:44940      3rd Qu.: 9.000  
##  Max.   :103.360   Max.   :0.44440   Max.   :44940      Max.   :12.000  
##  NA's   :6                                                              
##  Trimestre_Documento Anio_Documento Dias_Hasta_Vencimiento Antiguedad_Documento
##  Min.   :1.000       Min.   :2022   Min.   :-272.00        Min.   :  15.0      
##  1st Qu.:1.000       1st Qu.:2022   1st Qu.:  38.00        1st Qu.: 396.0      
##  Median :2.000       Median :2024   Median :  47.00        Median : 806.0      
##  Mean   :2.321       Mean   :2024   Mean   :  44.83        Mean   : 800.5      
##  3rd Qu.:3.000       3rd Qu.:2025   3rd Qu.:  49.00        3rd Qu.:1208.0      
##  Max.   :4.000       Max.   :2026   Max.   : 417.00        Max.   :1558.0      
##                                                                                
##  Monto_Absoluto        Log_Monto      Terms_Grupo          Estado_Num     
##  Min.   :5.560e+02   Min.   : 6.323   Length:141669      Min.   :0.00000  
##  1st Qu.:1.515e+06   1st Qu.:14.231   Class :character   1st Qu.:0.00000  
##  Median :6.134e+06   Median :15.629   Mode  :character   Median :0.00000  
##  Mean   :2.312e+07   Mean   :15.517                      Mean   :0.05151  
##  3rd Qu.:2.040e+07   3rd Qu.:16.831                      3rd Qu.:0.00000  
##  Max.   :1.611e+09   Max.   :21.200                      Max.   :1.00000  
##                                                                           
##  flag_fecha_inconsistente flag_amount_negativo
##  Min.   :0.000e+00        Min.   :0.0000000   
##  1st Qu.:0.000e+00        1st Qu.:0.0000000   
##  Median :0.000e+00        Median :0.0000000   
##  Mean   :5.647e-05        Mean   :0.0001623   
##  3rd Qu.:0.000e+00        3rd Qu.:0.0000000   
##  Max.   :1.000e+00        Max.   :1.0000000   
##

# ── Limpiar nombres de columnas ───────────────────────────────────────────────
df <- df_raw %>% clean_names()

cat("\nNombres de columnas limpios:\n")

## 
## Nombres de columnas limpios:

print(names(df))

##  [1] "anon_customer_id"           "anon_document_id"          
##  [3] "terms_of_payment"           "document_type"             
##  [5] "document_date"              "payment_date"              
##  [7] "net_due_date"               "clearing_date"             
##  [9] "arrears_after_net_due_date" "amount_in_local_currency"  
## [11] "reason_code"                "year_month"                
## [13] "estado_cartera"             "arrears_calc"              
## [15] "pago_oportuno_bin"          "bucket_mora_calc"          
## [17] "prom_mora_cliente"          "max_mora_cliente"          
## [19] "min_mora_cliente"           "desv_mora_cliente"         
## [21] "freq_mora_cliente"          "n_facturas_cliente"        
## [23] "mes_documento"              "trimestre_documento"       
## [25] "anio_documento"             "dias_hasta_vencimiento"    
## [27] "antiguedad_documento"       "monto_absoluto"            
## [29] "log_monto"                  "terms_grupo"               
## [31] "estado_num"                 "flag_fecha_inconsistente"  
## [33] "flag_amount_negativo"

# ── Conversión de tipos ───────────────────────────────────────────────────────
df <- df %>%
  mutate(
    # Fechas
    document_date = as.Date(document_date),
    payment_date  = as.Date(payment_date),
    net_due_date  = as.Date(net_due_date),
    clearing_date = as.Date(clearing_date),

    # Numéricas continuas (ya deberían ser numéricas, forzar por seguridad)
    arrears_calc             = as.numeric(arrears_calc),
    amount_in_local_currency = as.numeric(amount_in_local_currency),
    log_monto                = as.numeric(log_monto),
    monto_absoluto           = as.numeric(monto_absoluto),
    prom_mora_cliente        = as.numeric(prom_mora_cliente),
    max_mora_cliente         = as.numeric(max_mora_cliente),
    min_mora_cliente         = as.numeric(min_mora_cliente),
    desv_mora_cliente        = as.numeric(desv_mora_cliente),
    freq_mora_cliente        = as.numeric(freq_mora_cliente),
    dias_hasta_vencimiento   = as.numeric(dias_hasta_vencimiento),
    antiguedad_documento     = as.numeric(antiguedad_documento),

    # Enteras
    n_facturas_cliente       = as.integer(n_facturas_cliente),
    mes_documento            = as.integer(mes_documento),
    trimestre_documento      = as.integer(trimestre_documento),
    anio_documento           = as.integer(anio_documento),
    estado_num               = as.integer(estado_num),
    flag_fecha_inconsistente = as.integer(flag_fecha_inconsistente),
    flag_amount_negativo     = as.integer(flag_amount_negativo),

    # Factores (categóricas)
    terms_of_payment = as.factor(terms_of_payment),
    terms_grupo      = as.factor(terms_grupo),
    estado_cartera   = as.factor(estado_cartera),
    bucket_mora_calc = factor(bucket_mora_calc,
      levels  = c("Al dia","1-30","31-60","61-90","91-180","181-360",">360"),
      ordered = TRUE),

    # Variable objetivo binaria — FACTOR con niveles explícitos
    pago_oportuno_bin = factor(pago_oportuno_bin,
                               levels = c(0, 1),
                               labels = c("Mora","Oportuno"))
  )

cat("\n✓ Tipos de datos asignados\n")

## 
## ✓ Tipos de datos asignados

cat("  Clientes únicos  :", n_distinct(df$anon_customer_id), "\n")

##   Clientes únicos  : 265

cat("  Documentos únicos:", n_distinct(df$anon_document_id), "\n")

##   Documentos únicos: 10000

cat("  Flags (fecha)    :", sum(df$flag_fecha_inconsistente, na.rm = TRUE), "\n")

##   Flags (fecha)    : 8

cat("  Flags (monto neg):", sum(df$flag_amount_negativo, na.rm = TRUE), "\n\n")

##   Flags (monto neg): 23

# Distribución del target
cat("── Distribución Pago_Oportuno_Bin ──\n")

## ── Distribución Pago_Oportuno_Bin ──

print(table(df$pago_oportuno_bin))

## 
##     Mora Oportuno 
##     2742   138919

print(prop.table(table(df$pago_oportuno_bin)))

## 
##       Mora   Oportuno 
## 0.01935607 0.98064393

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 2 ▸ ANÁLISIS EXPLORATORIO (EDA CONFIRMATORIO)
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 2 — EDA CONFIRMATORIO\n")

##   SECCIÓN 2 — EDA CONFIRMATORIO

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# ── 2.1 Estadísticas descriptivas completas (psych) ──────────────────────────
cat("── Estadísticas descriptivas (psych::describe) ──\n")

## ── Estadísticas descriptivas (psych::describe) ──

vars_desc <- df %>%
  select(arrears_calc, amount_in_local_currency, log_monto,
         prom_mora_cliente, max_mora_cliente, freq_mora_cliente,
         dias_hasta_vencimiento, antiguedad_documento) %>%
  as.data.frame()
print(describe(vars_desc))

##                          vars      n        mean          sd     median
## arrears_calc                1 141661       -6.48       12.72       0.00
## amount_in_local_currency    2 141669 23118024.97 59330124.74 6131832.00
## log_monto                   3 141669       15.52        1.81      15.63
## prom_mora_cliente           4 141669       -6.48        7.40      -0.63
## max_mora_cliente            5 141669       58.51       84.03      26.00
## freq_mora_cliente           6 141669        0.02        0.02       0.02
## dias_hasta_vencimiento      7 141669       44.83        8.19      47.00
## antiguedad_documento        8 141669      800.46      461.30     806.00
##                              trimmed        mad          min          max
## arrears_calc                   -6.50       0.00       -76.00 5.860000e+02
## amount_in_local_currency 11042210.09 8167690.84 -80962383.99 1.611343e+09
## log_monto                      15.54       1.91         6.32 2.120000e+01
## prom_mora_cliente              -6.42       1.29       -23.00 1.965000e+01
## max_mora_cliente               50.39      47.44       -15.00 5.860000e+02
## freq_mora_cliente               0.02       0.03         0.00 4.400000e-01
## dias_hasta_vencimiento         44.60       7.41      -272.00 4.170000e+02
## antiguedad_documento          804.92     601.94        15.00 1.558000e+03
##                                 range  skew kurtosis        se
## arrears_calc             6.620000e+02 26.67  1177.26      0.03
## amount_in_local_currency 1.692306e+09  7.46    79.04 157629.63
## log_monto                1.488000e+01 -0.10    -0.48      0.00
## prom_mora_cliente        4.265000e+01  0.02    -1.23      0.02
## max_mora_cliente         6.010000e+02  2.94    14.74      0.22
## freq_mora_cliente        4.400000e-01  2.77    37.59      0.00
## dias_hasta_vencimiento   6.890000e+02  1.56    94.56      0.02
## antiguedad_documento     1.543000e+03 -0.06    -1.22      1.23

# ── 2.2 Frecuencias de categóricas ───────────────────────────────────────────
cat("\n── Frecuencias — Terms_of_Payment ──\n")

## 
## ── Frecuencias — Terms_of_Payment ──

print(table(df$terms_of_payment))

## 
##  B030  B045  P030  Z000  Z014  Z040  Z045  Z090  Z502  Z505  Z516  Z521  Z522 
##     9    63    91     8     3   269     3    73    38   191    15  9685 44944 
##  Z525  Z526  Z540  Z668  Z672  Z691  Z827  Z913  Z914 
##  1131 14591  3887    36   644   195    48  8263 57482

print(prop.table(table(df$terms_of_payment)))

## 
##         B030         B045         P030         Z000         Z014         Z040 
## 6.352837e-05 4.446986e-04 6.423424e-04 5.646966e-05 2.117612e-05 1.898792e-03 
##         Z045         Z090         Z502         Z505         Z516         Z521 
## 2.117612e-05 5.152856e-04 2.682309e-04 1.348213e-03 1.058806e-04 6.836358e-02 
##         Z522         Z525         Z526         Z540         Z668         Z672 
## 3.172465e-01 7.983398e-03 1.029936e-01 2.743720e-02 2.541135e-04 4.545807e-03 
##         Z691         Z827         Z913         Z914 
## 1.376448e-03 3.388179e-04 5.832610e-02 4.057486e-01

cat("\n── Frecuencias — Estado_Cartera ──\n")

## 
## ── Frecuencias — Estado_Cartera ──

print(table(df$estado_cartera))

## 
## Abierta Cerrada 
##    7298  134371

cat("\n── Frecuencias — Bucket_Mora ──\n")

## 
## ── Frecuencias — Bucket_Mora ──

print(table(df$bucket_mora_calc))

## 
##  Al dia    1-30   31-60   61-90  91-180 181-360    >360 
##  138919    2609      77       3       4       6      43

print(prop.table(table(df$bucket_mora_calc)))

## 
##       Al dia         1-30        31-60        61-90       91-180      181-360 
## 9.806439e-01 1.841721e-02 5.435512e-04 2.117732e-05 2.823642e-05 4.235464e-05 
##         >360 
## 3.035416e-04

cat("\n── Frecuencias — Terms_Grupo ──\n")

## 
## ── Frecuencias — Terms_Grupo ──

print(table(df$terms_grupo))

## 
## Otros  Z521  Z522  Z526  Z540  Z913  Z914 
##  2817  9685 44944 14591  3887  8263 57482

# ── 2.3 Matriz de correlación ────────────────────────────────────────────────
cat("\n── Matriz de correlación ──\n")

## 
## ── Matriz de correlación ──

vars_cor <- df %>%
  select(arrears_calc, log_monto, prom_mora_cliente, max_mora_cliente,
         freq_mora_cliente, desv_mora_cliente, dias_hasta_vencimiento,
         antiguedad_documento, mes_documento, estado_num) %>%
  drop_na() %>%
  as.data.frame()

mat_cor <- cor(vars_cor)
print(round(mat_cor, 3))

##                        arrears_calc log_monto prom_mora_cliente
## arrears_calc                  1.000    -0.080             0.582
## log_monto                    -0.080     1.000            -0.142
## prom_mora_cliente             0.582    -0.142             1.000
## max_mora_cliente              0.391    -0.177             0.672
## freq_mora_cliente             0.289    -0.187             0.496
## desv_mora_cliente             0.174     0.139             0.300
## dias_hasta_vencimiento       -0.219     0.072            -0.369
## antiguedad_documento          0.015    -0.090             0.046
## mes_documento                -0.013     0.015            -0.022
## estado_num                    0.071     0.033             0.009
##                        max_mora_cliente freq_mora_cliente desv_mora_cliente
## arrears_calc                      0.391             0.289             0.174
## log_monto                        -0.177            -0.187             0.139
## prom_mora_cliente                 0.672             0.496             0.300
## max_mora_cliente                  1.000             0.508             0.672
## freq_mora_cliente                 0.508             1.000             0.132
## desv_mora_cliente                 0.672             0.132             1.000
## dias_hasta_vencimiento           -0.318            -0.247            -0.049
## antiguedad_documento              0.055            -0.063             0.029
## mes_documento                    -0.013            -0.022             0.008
## estado_num                       -0.001             0.073             0.008
##                        dias_hasta_vencimiento antiguedad_documento
## arrears_calc                           -0.219                0.015
## log_monto                               0.072               -0.090
## prom_mora_cliente                      -0.369                0.046
## max_mora_cliente                       -0.318                0.055
## freq_mora_cliente                      -0.247               -0.063
## desv_mora_cliente                      -0.049                0.029
## dias_hasta_vencimiento                  1.000               -0.163
## antiguedad_documento                   -0.163                1.000
## mes_documento                           0.055               -0.066
## estado_num                              0.049               -0.382
##                        mes_documento estado_num
## arrears_calc                  -0.013      0.071
## log_monto                      0.015      0.033
## prom_mora_cliente             -0.022      0.009
## max_mora_cliente              -0.013     -0.001
## freq_mora_cliente             -0.022      0.073
## desv_mora_cliente              0.008      0.008
## dias_hasta_vencimiento         0.055      0.049
## antiguedad_documento          -0.066     -0.382
## mes_documento                  1.000     -0.233
## estado_num                    -0.233      1.000

corrplot(mat_cor,
  method      = "color",
  type        = "lower",
  addCoef.col = "black",
  number.cex  = 0.75,
  tl.cex      = 0.85,
  tl.col      = C1,
  col         = colorRampPalette(c(C2, "white", C1))(200),
  title       = "Correlación — Variables de Modelado",
  mar         = c(0, 0, 2, 0)
)

# ── 2.4 Gráficos exploratorios ───────────────────────────────────────────────
# Distribución de Bucket_Mora
p_bkt <- df %>%
  filter(!is.na(bucket_mora_calc)) %>%
  count(bucket_mora_calc) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = bucket_mora_calc, y = n, fill = bucket_mora_calc)) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  geom_text(aes(label = paste0(pct, "%")), vjust = -0.4, size = 3.5, fontface = "bold") +
  scale_fill_manual(values = PALETA_BUCKET) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Distribución por Bucket_Mora", x = NULL, y = "Registros") +
  tema_base

# Target binario
tab_target <- df %>%
  count(pago_oportuno_bin) %>%
  mutate(pct = round(n / sum(n) * 100, 1))

p_target <- ggplot(tab_target, aes(x = pago_oportuno_bin, y = n,
                                    fill = pago_oportuno_bin)) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  geom_text(aes(label = paste0(comma(n), "\n(", pct, "%)")),
            vjust = -0.3, size = 4, fontface = "bold") +
  scale_fill_manual(values = c("Mora" = C2, "Oportuno" = C4)) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(title = "Variable Objetivo: Pago_Oportuno_Bin",
       subtitle = "Desbalance severo: 98,1% oportuno vs 1,9% mora",
       x = NULL, y = "Registros") +
  tema_base

# Arrears por condición de pago
p_arr_terms <- df %>%
  filter(!is.na(arrears_calc), !is.na(terms_grupo)) %>%
  ggplot(aes(x = reorder(terms_grupo, arrears_calc, FUN = median),
             y = arrears_calc, fill = terms_grupo)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.1, show.legend = FALSE) +
  geom_hline(yintercept = 0, linetype = "dashed", color = C2, linewidth = 0.8) +
  scale_y_continuous(labels = comma) +
  labs(title = "Arrears por Condición de Pago",
       subtitle = "Línea roja = 0 días (umbral mora)",
       x = "Terms_Grupo", y = "Días de atraso") +
  tema_base +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Dispersión: Log_Monto vs Arrears
set.seed(42)
df_sample <- df %>%
  filter(!is.na(arrears_calc), !is.na(log_monto)) %>%
  slice_sample(n = 5000)

p_scatter <- ggplot(df_sample,
                    aes(x = log_monto, y = arrears_calc,
                        color = pago_oportuno_bin)) +
  geom_point(alpha = 0.25, size = 0.9) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  scale_color_manual(values = c("Mora" = C2, "Oportuno" = C4),
                     name = "Estado pago") +
  labs(title = "Scatter: Log(Monto) vs Arrears",
       subtitle = "Muestra aleatoria de 5.000 registros",
       x = "Log(Monto)", y = "Días de atraso (Arrears_calc)") +
  tema_base

panel_eda <- (p_target | p_bkt) / (p_arr_terms | p_scatter) +
  plot_annotation(
    title = "EDA Confirmatorio — Paso 4 Modelado I2C",
    theme = theme(plot.title = element_text(face = "bold", size = 14, color = C1))
  )
print(panel_eda)

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 3 ▸ PREPARACIÓN DEL DATASET DE MODELADO
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 3 — PREPARACIÓN DEL DATASET\n")

##   SECCIÓN 3 — PREPARACIÓN DEL DATASET

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# ── 3.1 Filtrar anomalías y construir dataset limpio ─────────────────────────
df_work <- df %>%
  filter(
    flag_fecha_inconsistente == 0,   # excluir 8 registros con fechas incoherentes
    flag_amount_negativo     == 0,   # excluir 23 registros con monto negativo
    !is.na(arrears_calc),
    !is.na(pago_oportuno_bin),
    !is.na(log_monto),
    !is.na(prom_mora_cliente),
    !is.na(dias_hasta_vencimiento)
  ) %>%
  as.data.frame()

cat("Registros para modelado:", formatC(nrow(df_work), big.mark = ","), "\n")

## Registros para modelado: 141,638

cat("  (excluidos: 8 fechas inconsistentes + 23 montos negativos + NAs)\n\n")

##   (excluidos: 8 fechas inconsistentes + 23 montos negativos + NAs)

# ── 3.2 Definir variables predictoras por modelo ─────────────────────────────
# Features para REGRESIÓN LINEAL (Arrears_calc ~ ...)
VARS_LM <- c(
  "log_monto", "prom_mora_cliente", "max_mora_cliente",
  "freq_mora_cliente", "desv_mora_cliente",
  "dias_hasta_vencimiento", "mes_documento",
  "trimestre_documento", "anio_documento",
  "antiguedad_documento", "estado_num", "terms_grupo"
)

# Features para CLASIFICACIÓN (Pago_Oportuno_Bin ~ ...)
VARS_CLAS <- c(
  "log_monto", "prom_mora_cliente", "max_mora_cliente",
  "freq_mora_cliente", "desv_mora_cliente",
  "dias_hasta_vencimiento", "mes_documento",
  "trimestre_documento", "estado_num", "terms_grupo"
)

cat("Variables predictoras — Regresión lineal  :", length(VARS_LM), "\n")

## Variables predictoras — Regresión lineal  : 12

cat("Variables predictoras — Clasificación     :", length(VARS_CLAS), "\n\n")

## Variables predictoras — Clasificación     : 10

# ── 3.3 Transformaciones adicionales para mejorar modelos ────────────────────
df_work <- df_work %>%
  mutate(
    # Cuadrado de log_monto (captura no-linealidad en monto)
    log_monto_sq    = log_monto^2,
    # Inverso de días hasta vencimiento (plazos cortos tienen mayor impacto)
    inv_dias_vcto   = if_else(dias_hasta_vencimiento > 0,
                               1 / dias_hasta_vencimiento, NA_real_),
    # Interacción: frecuencia de mora × promedio mora (score compuesto)
    score_riesgo    = freq_mora_cliente * abs(prom_mora_cliente),
    # Monto en millones (más interpretable)
    monto_millones  = monto_absoluto / 1e6
  )

cat("✓ Transformaciones adicionales creadas:\n")

## ✓ Transformaciones adicionales creadas:

cat("  · log_monto_sq   = log_monto²  (no-linealidad)\n")

##   · log_monto_sq   = log_monto²  (no-linealidad)

cat("  · inv_dias_vcto  = 1/días_vencimiento (urgencia del plazo)\n")

##   · inv_dias_vcto  = 1/días_vencimiento (urgencia del plazo)

cat("  · score_riesgo   = freq_mora × |prom_mora| (score compuesto)\n")

##   · score_riesgo   = freq_mora × |prom_mora| (score compuesto)

cat("  · monto_millones = Amount / 1.000.000\n\n")

##   · monto_millones = Amount / 1.000.000

# Convertir terms_grupo a dummies para regresión
df_work$terms_grupo <- as.factor(df_work$terms_grupo)


# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 4 ▸ DIVISIÓN TRAIN / TEST  (70% / 30%)
# ───────────────────────────────────────────────────────────────────────────────

cat("═══════════════════════════════════════════════════════\n")

## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 4 — DIVISIÓN TRAIN / TEST\n")

##   SECCIÓN 4 — DIVISIÓN TRAIN / TEST

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

set.seed(42)
n_total   <- nrow(df_work)
idx_train <- sample.int(n_total, size = floor(0.70 * n_total))
df_train  <- df_work[ idx_train, ]
df_test   <- df_work[-idx_train, ]

cat("  Total disponible :", formatC(n_total,         big.mark = ","), "\n")

##   Total disponible : 141,638

cat("  Train  (70%)     :", formatC(nrow(df_train),  big.mark = ","), "\n")

##   Train  (70%)     : 99,146

cat("  Test   (30%)     :", formatC(nrow(df_test),   big.mark = ","), "\n\n")

##   Test   (30%)     : 42,492

cat("Proporción Pago_Oportuno_Bin en TRAIN:\n")

## Proporción Pago_Oportuno_Bin en TRAIN:

print(prop.table(table(df_train$pago_oportuno_bin)))

## 
##       Mora   Oportuno 
## 0.01966796 0.98033204

cat("Proporción Pago_Oportuno_Bin en TEST:\n")

## Proporción Pago_Oportuno_Bin en TEST:

print(prop.table(table(df_test$pago_oportuno_bin)))

## 
##      Mora  Oportuno 
## 0.0186388 0.9813612

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 5 ▸ MODELO 1 — REGRESIÓN LINEAL MÚLTIPLE (Arrears_calc)
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 5 — MODELO 1: REGRESIÓN LINEAL (Arrears)\n")

##   SECCIÓN 5 — MODELO 1: REGRESIÓN LINEAL (Arrears)

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# ── 5.1 Fórmula del modelo completo ──────────────────────────────────────────
formula_lm <- as.formula(
  paste("arrears_calc ~",
        paste(c(VARS_LM, "log_monto_sq", "inv_dias_vcto", "score_riesgo"),
              collapse = " + "))
)

cat("Fórmula modelo lineal:\n"); print(formula_lm)

## Fórmula modelo lineal:

## arrears_calc ~ log_monto + prom_mora_cliente + max_mora_cliente + 
##     freq_mora_cliente + desv_mora_cliente + dias_hasta_vencimiento + 
##     mes_documento + trimestre_documento + anio_documento + antiguedad_documento + 
##     estado_num + terms_grupo + log_monto_sq + inv_dias_vcto + 
##     score_riesgo

# ── 5.2 Ajuste del modelo ────────────────────────────────────────────────────
cat("\nAjustando modelo de regresión lineal...\n")

## 
## Ajustando modelo de regresión lineal...

modelo_lm <- lm(formula_lm, data = df_train)

cat("\n── Resumen del modelo completo ──\n")

## 
## ── Resumen del modelo completo ──

print(summary(modelo_lm))

## 
## Call:
## lm(formula = formula_lm, data = df_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78.19  -0.48   0.17   0.80 564.27 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.909e+03  3.047e+03   0.627  0.53097    
## log_monto               2.320e-01  2.693e-01   0.862  0.38893    
## prom_mora_cliente       4.640e-01  3.623e-02  12.808  < 2e-16 ***
## max_mora_cliente       -2.835e-03  1.759e-03  -1.612  0.10696    
## freq_mora_cliente      -8.485e+00  4.939e+00  -1.718  0.08578 .  
## desv_mora_cliente       1.186e-01  1.389e-02   8.539  < 2e-16 ***
## dias_hasta_vencimiento -4.424e-03  7.717e-03  -0.573  0.56645    
## mes_documento          -6.838e-01  1.287e-01  -5.311 1.09e-07 ***
## trimestre_documento     2.179e+00  1.326e-01  16.435  < 2e-16 ***
## anio_documento         -9.443e-01  1.504e+00  -0.628  0.53008    
## antiguedad_documento   -1.953e-03  4.110e-03  -0.475  0.63466    
## estado_num              5.613e+00  1.787e-01  31.409  < 2e-16 ***
## terms_grupoZ521         7.821e-01  2.950e-01   2.651  0.00803 ** 
## terms_grupoZ522         8.998e-01  3.635e-01   2.476  0.01330 *  
## terms_grupoZ526        -4.872e-01  2.755e-01  -1.769  0.07694 .  
## terms_grupoZ540        -9.733e-01  3.609e-01  -2.697  0.00700 ** 
## terms_grupoZ913        -4.902e+00  4.357e-01 -11.251  < 2e-16 ***
## terms_grupoZ914        -7.835e+00  5.612e-01 -13.960  < 2e-16 ***
## log_monto_sq           -8.042e-03  8.698e-03  -0.925  0.35519    
## inv_dias_vcto           9.906e-01  3.616e+00   0.274  0.78411    
## score_riesgo            7.836e-01  4.774e-01   1.641  0.10072    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.84 on 99121 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.3283, Adjusted R-squared:  0.3281 
## F-statistic:  2422 on 20 and 99121 DF,  p-value: < 2.2e-16

# Intervalos de confianza (95%)
cat("\n── Intervalos de confianza (95%) ──\n")

## 
## ── Intervalos de confianza (95%) ──

print(confint(modelo_lm))

##                                2.5 %        97.5 %
## (Intercept)            -4.063418e+03  7.881810e+03
## log_monto              -2.958383e-01  7.599304e-01
## prom_mora_cliente       3.930169e-01  5.350400e-01
## max_mora_cliente       -6.282501e-03  6.119953e-04
## freq_mora_cliente      -1.816521e+01  1.194566e+00
## desv_mora_cliente       9.141218e-02  1.458773e-01
## dias_hasta_vencimiento -1.954974e-02  1.070141e-02
## mes_documento          -9.360904e-01 -4.314455e-01
## trimestre_documento     1.919049e+00  2.438764e+00
## anio_documento         -3.891820e+00  2.003315e+00
## antiguedad_documento   -1.000811e-02  6.102293e-03
## estado_num              5.262637e+00  5.963141e+00
## terms_grupoZ521         2.038351e-01  1.360280e+00
## terms_grupoZ522         1.874727e-01  1.612211e+00
## terms_grupoZ526        -1.027146e+00  5.268713e-02
## terms_grupoZ540        -1.680627e+00 -2.659291e-01
## terms_grupoZ913        -5.756337e+00 -4.048360e+00
## terms_grupoZ914        -8.934659e+00 -6.734681e+00
## log_monto_sq           -2.508918e-02  9.005883e-03
## inv_dias_vcto          -6.095944e+00  8.077086e+00
## score_riesgo           -1.521058e-01  1.719371e+00

# ── 5.3 Diagnóstico gráfico ──────────────────────────────────────────────────
cat("\n── Diagnóstico gráfico del modelo lineal (4 paneles) ──\n")

## 
## ── Diagnóstico gráfico del modelo lineal (4 paneles) ──

par(mfrow = c(2, 2))
plot(modelo_lm, col = C1, pch = 19, cex = 0.4,
     main = "Diagnóstico Regresión Lineal")

par(mfrow = c(1, 1))

# Test de Durbin-Watson (autocorrelación de residuos)
cat("\n── Test Durbin-Watson ──\n")

## 
## ── Test Durbin-Watson ──

dw_result <- dwtest(modelo_lm)
print(dw_result)

## 
##  Durbin-Watson test
## 
## data:  modelo_lm
## DW = 1.9976, p-value = 0.3512
## alternative hypothesis: true autocorrelation is greater than 0

cat(sprintf("  DW = %.4f  |  p-valor = %.4f\n",
            dw_result$statistic, dw_result$p.value))

##   DW = 1.9976  |  p-valor = 0.3512

cat("  Interpretación: DW ≈ 2 → sin autocorrelación (ideal)\n")

##   Interpretación: DW ≈ 2 → sin autocorrelación (ideal)

# ── 5.4 Selección stepwise ───────────────────────────────────────────────────
cat("\n── Selección stepwise (AIC) ──\n")

## 
## ── Selección stepwise (AIC) ──

modelo_lm_step <- step(modelo_lm, direction = "both", trace = 0)
cat("\n► Modelo stepwise final:\n")

## 
## ► Modelo stepwise final:

print(summary(modelo_lm_step))

## 
## Call:
## lm(formula = arrears_calc ~ prom_mora_cliente + max_mora_cliente + 
##     freq_mora_cliente + desv_mora_cliente + mes_documento + trimestre_documento + 
##     anio_documento + estado_num + terms_grupo + score_riesgo, 
##     data = df_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78.62  -0.47   0.17   0.80 564.17 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         476.000894  61.257554   7.770 7.89e-15 ***
## prom_mora_cliente     0.464679   0.035932  12.932  < 2e-16 ***
## max_mora_cliente     -0.002889   0.001752  -1.649 0.099203 .  
## freq_mora_cliente    -8.805533   4.924013  -1.788 0.073733 .  
## desv_mora_cliente     0.118590   0.013805   8.590  < 2e-16 ***
## mes_documento        -0.627573   0.043115 -14.556  < 2e-16 ***
## trimestre_documento   2.186103   0.132109  16.548  < 2e-16 ***
## anio_documento       -0.236238   0.030278  -7.802 6.14e-15 ***
## estado_num            5.613796   0.178644  31.424  < 2e-16 ***
## terms_grupoZ521       0.781897   0.283029   2.763 0.005735 ** 
## terms_grupoZ522       0.934178   0.356563   2.620 0.008796 ** 
## terms_grupoZ526      -0.523661   0.267871  -1.955 0.050597 .  
## terms_grupoZ540      -1.090826   0.328353  -3.322 0.000894 ***
## terms_grupoZ913      -4.869068   0.418688 -11.629  < 2e-16 ***
## terms_grupoZ914      -7.863899   0.554930 -14.171  < 2e-16 ***
## score_riesgo          0.816300   0.476233   1.714 0.086517 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.84 on 99126 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.3283, Adjusted R-squared:  0.3282 
## F-statistic:  3229 on 15 and 99126 DF,  p-value: < 2.2e-16

cat(sprintf("\n  AIC modelo completo  : %.2f\n", AIC(modelo_lm)))

## 
##   AIC modelo completo  : 753897.09

cat(sprintf("  AIC modelo stepwise  : %.2f\n",  AIC(modelo_lm_step)))

##   AIC modelo stepwise  : 753889.48

# ── 5.5 Mejor subconjunto (regsubsets) ───────────────────────────────────────
cat("\n── Mejor subconjunto (leaps::regsubsets) ──\n")

## 
## ── Mejor subconjunto (leaps::regsubsets) ──

# Construir X solo con variables numéricas para regsubsets
vars_num_lm <- c("log_monto","prom_mora_cliente","max_mora_cliente",
                 "freq_mora_cliente","desv_mora_cliente",
                 "dias_hasta_vencimiento","mes_documento",
                 "anio_documento","estado_num",
                 "log_monto_sq","score_riesgo")

X_lm <- df_train[, vars_num_lm] %>% drop_na() %>% as.matrix()
y_lm <- df_train$arrears_calc[complete.cases(df_train[, vars_num_lm])]

best_sub <- regsubsets(x = X_lm, y = y_lm,
                       nbest = 1, nvmax = 8, method = "exhaustive")
best_sum <- summary(best_sub)

cat("\nR² ajustado por número de variables:\n")

## 
## R² ajustado por número de variables:

print(round(best_sum$adjr2, 4))

## [1] 0.3185 0.3237 0.3241 0.3243 0.3245 0.3245 0.3246 0.3246

cat("\nBIC por número de variables:\n")

## 
## BIC por número de variables:

print(round(best_sum$bic,   3))

## [1] -37994.43 -38737.89 -38789.08 -38816.43 -38824.52 -38825.43 -38821.00
## [8] -38809.59

cat("\nCp por número de variables:\n")

## 
## Cp por número de variables:

print(round(best_sum$cp,    3))

## [1] 892.263 135.430  74.678  37.798  20.207   9.790   4.717   6.622

n_opt <- which.min(best_sum$bic)
cat(sprintf("\n✓ Variables óptimas según BIC mínimo: %d\n", n_opt))

## 
## ✓ Variables óptimas según BIC mínimo: 6

cat("  Variables seleccionadas:\n")

##   Variables seleccionadas:

print(best_sum$which[n_opt, ])

##            (Intercept)              log_monto      prom_mora_cliente 
##                   TRUE                  FALSE                   TRUE 
##       max_mora_cliente      freq_mora_cliente      desv_mora_cliente 
##                  FALSE                  FALSE                   TRUE 
## dias_hasta_vencimiento          mes_documento         anio_documento 
##                   TRUE                   TRUE                   TRUE 
##             estado_num           log_monto_sq           score_riesgo 
##                   TRUE                  FALSE                  FALSE

# ── 5.6 Predicción y métricas ────────────────────────────────────────────────
# predict(newdata = df_train) puede devolver menos filas que df_train si
# el modelo eliminó NAs internamente. Se usa fitted() para train y
# predict() solo para test (donde sí podemos controlar los NAs).
pred_lm_test  <- predict(modelo_lm_step, newdata = df_test)

# Para train: usar los valores ajustados directamente del modelo
# y los valores reales de las filas efectivamente usadas en el ajuste
idx_used_train <- as.integer(rownames(model.frame(modelo_lm_step)))
y_train_used   <- df_train$arrears_calc[idx_used_train]
pred_lm_train  <- fitted(modelo_lm_step)

RMSE <- function(y, yhat) sqrt(mean((y - yhat)^2, na.rm = TRUE))
MAE  <- function(y, yhat) mean(abs(y - yhat), na.rm = TRUE)
R2   <- function(y, yhat) cor(y, yhat, use = "complete.obs")^2

rmse_tr <- RMSE(y_train_used,         pred_lm_train)
rmse_te <- RMSE(df_test$arrears_calc, pred_lm_test)
mae_tr  <- MAE(y_train_used,          pred_lm_train)
mae_te  <- MAE(df_test$arrears_calc,  pred_lm_test)
r2_te   <- R2(df_test$arrears_calc,   pred_lm_test)

cat("\n── Métricas Regresión Lineal ──────────────────────\n")

## 
## ── Métricas Regresión Lineal ──────────────────────

cat(sprintf("  RMSE  Train : %.4f días\n", rmse_tr))

##   RMSE  Train : 15.3612 días

cat(sprintf("  RMSE  Test  : %.4f días  ← indicador principal\n", rmse_te))

##   RMSE  Test  : 8.8456 días  ← indicador principal

cat(sprintf("  MAE   Train : %.4f días\n", mae_tr))

##   MAE   Train : 8.2739 días

cat(sprintf("  MAE   Test  : %.4f días\n", mae_te))

##   MAE   Test  : 1.5695 días

cat(sprintf("  R²    Test  : %.6f\n",      r2_te))

##   R²    Test  : 0.406928

# Gráfico real vs predicho
p_lm <- data.frame(real = df_test$arrears_calc, pred = pred_lm_test) %>%
  ggplot(aes(x = real, y = pred)) +
  geom_point(alpha = 0.12, size = 0.7, color = C1) +
  geom_abline(slope = 1, intercept = 0, color = C2,
              linetype = "dashed", linewidth = 1) +
  geom_smooth(method = "lm", color = C4, linewidth = 0.8, se = FALSE) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(title    = "Regresión Lineal: Arrears Real vs Predicho",
       subtitle = sprintf("RMSE = %.4f días  |  R² = %.6f", rmse_te, r2_te),
       caption  = "Línea roja = predicción perfecta  |  Línea verde = tendencia real",
       x = "Arrears real (días)", y = "Arrears predicho (días)") +
  tema_base
print(p_lm)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Gráfico de residuos
# NOTA: fitted() y residuals() se extraen directamente del modelo para
# garantizar que ambos vectores tienen exactamente las mismas filas.
# lm() elimina internamente filas con NAs, por lo que predict(newdata=df_train)
# puede devolver más filas que residuals() — ese desajuste causaba el error.
p_resid <- data.frame(
  fitted = fitted(modelo_lm_step),      # valores ajustados del modelo
  resid  = residuals(modelo_lm_step)    # residuos del mismo modelo
) %>%
  ggplot(aes(x = fitted, y = resid)) +
  geom_point(alpha = 0.1, size = 0.6, color = C1) +
  geom_hline(yintercept = 0, color = C2, linetype = "dashed") +
  geom_smooth(method = "loess", color = C3, linewidth = 0.8, se = FALSE) +
  labs(title    = "Residuos vs Valores Ajustados",
       subtitle = "Sin patrón sistemático = buen ajuste",
       x = "Valores ajustados", y = "Residuos") +
  tema_base
print(p_resid)

## `geom_smooth()` using formula = 'y ~ x'

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 6 ▸ MODELO 2 — REGRESIÓN LOGÍSTICA (Pago_Oportuno_Bin — datos originales)
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 6 — MODELO 2: REGRESIÓN LOGÍSTICA (original)\n")

##   SECCIÓN 6 — MODELO 2: REGRESIÓN LOGÍSTICA (original)

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

cat("NOTA: Con 98,1% vs 1,9% de desbalance, este modelo sirve\n")

## NOTA: Con 98,1% vs 1,9% de desbalance, este modelo sirve

cat("como línea base. Los modelos con datos balanceados (SEC 8-9)\n")

## como línea base. Los modelos con datos balanceados (SEC 8-9)

cat("son los definitivos para comparación.\n\n")

## son los definitivos para comparación.

formula_logit <- as.formula(
  paste("pago_oportuno_bin ~",
        paste(c(VARS_CLAS, "score_riesgo"), collapse = " + "))
)

cat("Ajustando regresión logística (datos originales)...\n")

## Ajustando regresión logística (datos originales)...

modelo_logit <- glm(formula_logit,
                    data   = df_train,
                    family = binomial(link = "logit"))

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

cat("\n── Resumen modelo logístico ──\n")

## 
## ── Resumen modelo logístico ──

print(summary(modelo_logit))

## 
## Call:
## glm(formula = formula_logit, family = binomial(link = "logit"), 
##     data = df_train)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.010e+02  1.593e+03   0.063  0.94949    
## log_monto              -7.922e-03  3.493e-02  -0.227  0.82057    
## prom_mora_cliente      -2.298e-01  5.653e-02  -4.064 4.82e-05 ***
## max_mora_cliente       -8.771e-03  4.061e-03  -2.160  0.03081 *  
## freq_mora_cliente       9.528e+00  3.081e+00   3.092  0.00199 ** 
## desv_mora_cliente       1.036e-01  5.702e-02   1.818  0.06911 .  
## dias_hasta_vencimiento  4.318e-01  2.819e-02  15.316  < 2e-16 ***
## mes_documento           1.126e+01  6.404e-01  17.587  < 2e-16 ***
## trimestre_documento    -5.032e+01  3.782e+02  -0.133  0.89414    
## estado_num             -9.665e+01  1.240e+03  -0.078  0.93788    
## terms_grupoZ521        -1.587e+00  2.658e+00  -0.597  0.55051    
## terms_grupoZ522        -2.076e+00  2.701e+00  -0.769  0.44212    
## terms_grupoZ526         2.139e+00  2.658e+00   0.805  0.42104    
## terms_grupoZ540        -2.224e+00  2.726e+00  -0.816  0.41464    
## terms_grupoZ913        -2.108e+00  2.717e+00  -0.776  0.43794    
## terms_grupoZ914         1.986e-01  2.778e+00   0.072  0.94299    
## score_riesgo           -4.755e+00  6.108e-01  -7.784 7.04e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19183.4  on 99141  degrees of freedom
## Residual deviance:  1941.5  on 99125  degrees of freedom
##   (4 observations deleted due to missingness)
## AIC: 1975.5
## 
## Number of Fisher Scoring iterations: 25

# Odds Ratios
cat("\n── Odds Ratios e IC 95% ──\n")

## 
## ── Odds Ratios e IC 95% ──

or_df <- data.frame(
  OR     = round(exp(coef(modelo_logit)), 4),
  IC_inf = round(exp(confint(modelo_logit))[, 1], 4),
  IC_sup = round(exp(confint(modelo_logit))[, 2], 4)
)

## Waiting for profiling to be done...

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Waiting for profiling to be done...

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

print(or_df)

##                                  OR     IC_inf        IC_sup
## (Intercept)            6.962089e+43        Inf 8.918136e+245
## log_monto              9.921000e-01     0.9265  1.062500e+00
## prom_mora_cliente      7.947000e-01     0.7076  8.843000e-01
## max_mora_cliente       9.913000e-01     0.9834  9.992000e-01
## freq_mora_cliente      1.373822e+04    38.8387  7.172688e+06
## desv_mora_cliente      1.109200e+00     0.9945  1.244400e+00
## dias_hasta_vencimiento 1.540000e+00     1.4600  1.630600e+00
## mes_documento          7.780116e+04 23663.6557  2.918319e+05
## trimestre_documento    0.000000e+00     0.0000  0.000000e+00
## estado_num             0.000000e+00     0.0000  0.000000e+00
## terms_grupoZ521        2.045000e-01     0.0054  6.546900e+00
## terms_grupoZ522        1.255000e-01     0.0027  4.734200e+00
## terms_grupoZ526        8.487600e+00     0.2587  3.108893e+02
## terms_grupoZ540        1.082000e-01     0.0021  4.542100e+00
## terms_grupoZ913        1.215000e-01     0.0025  5.044200e+00
## terms_grupoZ914        1.219700e+00     0.0219  7.040810e+01
## score_riesgo           8.600000e-03     0.0025  2.770000e-02

cat(sprintf("\n  AIC modelo logístico completo: %.2f\n", AIC(modelo_logit)))

## 
##   AIC modelo logístico completo: 1975.53

# Stepwise logístico
cat("\n── Stepwise logístico (AIC) ──\n")

## 
## ── Stepwise logístico (AIC) ──

modelo_logit_step <- step(modelo_logit, direction = "both", trace = 0)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

cat(sprintf("  AIC modelo stepwise: %.2f\n", AIC(modelo_logit_step)))

##   AIC modelo stepwise: 1973.58

# Evaluación en TEST
prob_logit   <- predict(modelo_logit_step, newdata = df_test, type = "response")
clase_logit  <- factor(if_else(prob_logit >= 0.5, "Oportuno", "Mora"),
                       levels = c("Mora", "Oportuno"))

cat("\n── Matriz de Confusión — Logística (umbral 0,5) ──\n")

## 
## ── Matriz de Confusión — Logística (umbral 0,5) ──

cm_logit <- confusionMatrix(data      = clase_logit,
                             reference = df_test$pago_oportuno_bin,
                             positive  = "Mora")
print(cm_logit)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Mora Oportuno
##   Mora       685       65
##   Oportuno   107    41633
##                                           
##                Accuracy : 0.996           
##                  95% CI : (0.9953, 0.9965)
##     No Information Rate : 0.9814          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8864          
##                                           
##  Mcnemar's Test P-Value : 0.001771        
##                                           
##             Sensitivity : 0.86490         
##             Specificity : 0.99844         
##          Pos Pred Value : 0.91333         
##          Neg Pred Value : 0.99744         
##              Prevalence : 0.01864         
##          Detection Rate : 0.01612         
##    Detection Prevalence : 0.01765         
##       Balanced Accuracy : 0.93167         
##                                           
##        'Positive' Class : Mora            
##

cat("\nTabla:\n");          print(cm_logit$table)

## 
## Tabla:

##           Reference
## Prediction  Mora Oportuno
##   Mora       685       65
##   Oportuno   107    41633

cat("\nMétricas clase:\n"); print(cm_logit$byClass)

## 
## Métricas clase:

##          Sensitivity          Specificity       Pos Pred Value 
##           0.86489899           0.99844117           0.91333333 
##       Neg Pred Value            Precision               Recall 
##           0.99743651           0.91333333           0.86489899 
##                   F1           Prevalence       Detection Rate 
##           0.88845655           0.01863968           0.01612144 
## Detection Prevalence    Balanced Accuracy 
##           0.01765121           0.93167008

cat("\nMétricas global:\n");print(cm_logit$overall)

## 
## Métricas global:

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.959520e-01   8.863967e-01   9.953013e-01   9.965334e-01   9.813603e-01 
## AccuracyPValue  McnemarPValue 
##  2.532200e-159   1.770686e-03

# Curva ROC
roc_df <- data.frame(
  real = as.integer(df_test$pago_oportuno_bin == "Mora"),
  prob = prob_logit
) %>%
  arrange(desc(prob)) %>%
  mutate(
    tpr = cumsum(real) / sum(real),
    fpr = cumsum(1 - real) / sum(1 - real)
  )

auc_logit <- with(roc_df, {
  n <- nrow(roc_df)
  abs(sum(diff(fpr) * (tpr[-n] + tpr[-1]) / 2))
})

p_roc1 <- ggplot(roc_df, aes(x = fpr, y = tpr)) +
  geom_line(color = C1, linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray60") +
  annotate("text", x = 0.6, y = 0.1,
           label = sprintf("AUC = %.4f", auc_logit),
           size = 4.5, fontface = "bold", color = C1) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  labs(title    = "ROC — Logística (datos originales)",
       subtitle = "Clase positiva: Mora",
       x = "Tasa Falsos Positivos", y = "Tasa Verdaderos Positivos") +
  tema_base
print(p_roc1)

cat(sprintf("\n  AUC logística (original): %.4f\n", auc_logit))

## 
##   AUC logística (original): 0.0008

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 7 ▸ BALANCEO DE CLASES
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 7 — BALANCEO DE CLASES\n")

##   SECCIÓN 7 — BALANCEO DE CLASES

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

cat("Distribución ORIGINAL en train:\n")

## Distribución ORIGINAL en train:

print(table(df_train$pago_oportuno_bin))

## 
##     Mora Oportuno 
##     1950    97196

print(prop.table(table(df_train$pago_oportuno_bin)))

## 
##       Mora   Oportuno 
## 0.01966796 0.98033204

# Variables para balanceo (solo numéricas)
VARS_BAL <- c("pago_oportuno_bin", "log_monto", "prom_mora_cliente",
              "max_mora_cliente", "freq_mora_cliente", "desv_mora_cliente",
              "dias_hasta_vencimiento", "mes_documento",
              "estado_num", "score_riesgo")

# Muestra estratificada para balanceo (eficiencia computacional)
# Tomar TODOS los casos de mora + muestra de oportuno
idx_mora     <- which(df_train$pago_oportuno_bin == "Mora")
idx_oportuno <- which(df_train$pago_oportuno_bin == "Oportuno")

set.seed(42)
# Submuestreo del oportuno: 10x los casos de mora para balanceo razonable
n_oportuno_sample <- min(length(idx_oportuno), length(idx_mora) * 10)
idx_oportuno_sel  <- sample(idx_oportuno, n_oportuno_sample)

df_bal_input <- df_train[c(idx_mora, idx_oportuno_sel), VARS_BAL] %>%
  mutate(pago_oportuno_bin = as.factor(as.character(pago_oportuno_bin))) %>%
  drop_na()

cat("\nMuestra para balanceo:\n")

## 
## Muestra para balanceo:

cat("  Mora     :", sum(df_bal_input$pago_oportuno_bin == "Mora"), "\n")

##   Mora     : 1950

cat("  Oportuno :", sum(df_bal_input$pago_oportuno_bin == "Oportuno"), "\n")

##   Oportuno : 19500

# ── 7.1 ROSE ─────────────────────────────────────────────────────────────────
cat("\n── Método 1: ROSE (Random Over-Sampling Examples) ──\n")

## 
## ── Método 1: ROSE (Random Over-Sampling Examples) ──

set.seed(42)
df_rose <- ovun.sample(
  pago_oportuno_bin ~ .,
  data   = df_bal_input,
  method = "both",
  p      = 0.5,
  N      = nrow(df_bal_input),
  seed   = 42
)$data

cat("Distribución después de ROSE:\n")

## Distribución después de ROSE:

print(table(df_rose$pago_oportuno_bin))

## 
## Oportuno     Mora 
##    10715    10735

print(prop.table(table(df_rose$pago_oportuno_bin)))

## 
##  Oportuno      Mora 
## 0.4995338 0.5004662

# ── 7.2 SMOTE ────────────────────────────────────────────────────────────────
cat("\n── Método 2: SMOTE (Synthetic Minority Over-sampling) ──\n")

## 
## ── Método 2: SMOTE (Synthetic Minority Over-sampling) ──

df_smote_input <- df_bal_input %>%
  mutate(target_num = as.integer(pago_oportuno_bin == "Mora")) %>%
  select(-pago_oportuno_bin) %>%
  as.data.frame()

set.seed(42)
smote_res <- SMOTE(
  X        = df_smote_input[, names(df_smote_input) != "target_num"],
  target   = df_smote_input$target_num,
  K        = 5,
  dup_size = 0
)

df_smote <- smote_res$data %>%
  as.data.frame() %>%
  rename(target_num = class) %>%
  mutate(pago_oportuno_bin = factor(
    if_else(as.numeric(as.character(target_num)) == 1, "Mora", "Oportuno"),
    levels = c("Mora", "Oportuno")
  )) %>%
  select(-target_num)

colnames(df_smote) <- make.names(colnames(df_smote))

cat("Distribución después de SMOTE:\n")

## Distribución después de SMOTE:

print(table(df_smote$pago_oportuno_bin))

## 
##     Mora Oportuno 
##    19500    19500

print(prop.table(table(df_smote$pago_oportuno_bin)))

## 
##     Mora Oportuno 
##      0.5      0.5

# Gráfico comparativo de distribuciones
par(mfrow = c(1, 3))
barplot(table(df_bal_input$pago_oportuno_bin),
        main = "ORIGINAL (muestra)", col = c(C2, C4), border = NA,
        names.arg = c("Mora", "Oportuno"),
        ylab = "Registros")
barplot(table(df_rose$pago_oportuno_bin),
        main = "ROSE", col = c(C2, C4), border = NA,
        names.arg = c("Mora", "Oportuno"))
barplot(table(df_smote$pago_oportuno_bin),
        main = "SMOTE", col = c(C2, C4), border = NA,
        names.arg = c("Mora", "Oportuno"))

par(mfrow = c(1, 1))


# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 8 ▸ MODELO 3 — REGRESIÓN LOGÍSTICA CON DATOS BALANCEADOS (ROSE)
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 8 — MODELO 3: LOGÍSTICA + ROSE\n")

##   SECCIÓN 8 — MODELO 3: LOGÍSTICA + ROSE

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# Fórmula para datos balanceados (sin terms_grupo por restricción de variables)
VARS_BAL_CLAS <- c("log_monto","prom_mora_cliente","max_mora_cliente",
                   "freq_mora_cliente","desv_mora_cliente",
                   "dias_hasta_vencimiento","mes_documento",
                   "estado_num","score_riesgo")

formula_bal <- as.formula(
  paste("pago_oportuno_bin ~",
        paste(intersect(VARS_BAL_CLAS, names(df_rose)), collapse = " + "))
)

cat("Ajustando logística con datos ROSE...\n")

## Ajustando logística con datos ROSE...

modelo_logit_rose <- glm(formula_bal,
                          data   = df_rose,
                          family = binomial(link = "logit"))

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

cat("\n── Resumen modelo logístico (ROSE) ──\n")

## 
## ── Resumen modelo logístico (ROSE) ──

print(summary(modelo_logit_rose))

## 
## Call:
## glm(formula = formula_bal, family = binomial(link = "logit"), 
##     data = df_rose)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -24.235706 267.171939  -0.091    0.928    
## log_monto               -0.020509   0.038021  -0.539    0.590    
## prom_mora_cliente        0.149331   0.015776   9.466  < 2e-16 ***
## max_mora_cliente         0.014948   0.001607   9.301  < 2e-16 ***
## freq_mora_cliente        6.431393   6.656123   0.966    0.334    
## desv_mora_cliente       -0.027596   0.020252  -1.363    0.173    
## dias_hasta_vencimiento  -0.048665   0.008766  -5.551 2.84e-08 ***
## mes_documento           -0.697432   0.066285 -10.522  < 2e-16 ***
## estado_num              30.970873 267.171157   0.116    0.908    
## score_riesgo             5.187773   0.682323   7.603 2.89e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29736.0  on 21449  degrees of freedom
## Residual deviance:  2178.4  on 21440  degrees of freedom
## AIC: 2198.4
## 
## Number of Fisher Scoring iterations: 21

# Stepwise
modelo_logit_rose_step <- step(modelo_logit_rose, direction = "both", trace = 0)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

cat(sprintf("\n  AIC logística ROSE completo  : %.2f\n", AIC(modelo_logit_rose)))

## 
##   AIC logística ROSE completo  : 2198.44

cat(sprintf("  AIC logística ROSE stepwise  : %.2f\n",  AIC(modelo_logit_rose_step)))

##   AIC logística ROSE stepwise  : 2195.35

# Evaluación en TEST original (sin balancear — rendimiento real)
prob_rose  <- predict(modelo_logit_rose_step, newdata = df_test, type = "response")
clase_rose <- factor(if_else(prob_rose >= 0.5, "Mora", "Oportuno"),
                     levels = c("Mora", "Oportuno"))

cat("\n── Matriz de Confusión — Logística ROSE (test original) ──\n")

## 
## ── Matriz de Confusión — Logística ROSE (test original) ──

cm_rose <- confusionMatrix(data      = clase_rose,
                            reference = df_test$pago_oportuno_bin,
                            positive  = "Mora")
print(cm_rose)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Mora Oportuno
##   Mora       790     1019
##   Oportuno     2    40681
##                                           
##                Accuracy : 0.976           
##                  95% CI : (0.9745, 0.9774)
##     No Information Rate : 0.9814          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.597           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99747         
##             Specificity : 0.97556         
##          Pos Pred Value : 0.43671         
##          Neg Pred Value : 0.99995         
##              Prevalence : 0.01864         
##          Detection Rate : 0.01859         
##    Detection Prevalence : 0.04257         
##       Balanced Accuracy : 0.98652         
##                                           
##        'Positive' Class : Mora            
##

cat("\nTabla:\n");          print(cm_rose$table)

## 
## Tabla:

##           Reference
## Prediction  Mora Oportuno
##   Mora       790     1019
##   Oportuno     2    40681

cat("\nMétricas clase:\n"); print(cm_rose$byClass)

## 
## Métricas clase:

##          Sensitivity          Specificity       Pos Pred Value 
##           0.99747475           0.97556355           0.43670536 
##       Neg Pred Value            Precision               Recall 
##           0.99995084           0.43670536           0.99747475 
##                   F1           Prevalence       Detection Rate 
##           0.60745867           0.01863880           0.01859173 
## Detection Prevalence    Balanced Accuracy 
##           0.04257272           0.98651915

cat("\nMétricas global:\n");print(cm_rose$overall)

## 
## Métricas global:

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.759719e-01   5.970105e-01   9.744712e-01   9.774072e-01   9.813612e-01 
## AccuracyPValue  McnemarPValue 
##   1.000000e+00  7.210296e-222

# ROC logística ROSE
roc_rose <- data.frame(
  real = as.integer(df_test$pago_oportuno_bin == "Mora"),
  prob = prob_rose
) %>%
  arrange(desc(prob)) %>%
  mutate(
    tpr = cumsum(real) / sum(real),
    fpr = cumsum(1 - real) / sum(1 - real)
  )
auc_rose <- with(roc_rose, abs(sum(diff(fpr) * (tpr[-nrow(roc_rose)] + tpr[-1]) / 2)))
cat(sprintf("\n  AUC logística ROSE: %.4f\n", auc_rose))

## 
##   AUC logística ROSE: 0.9966

p_roc2 <- ggplot(roc_rose, aes(x = fpr, y = tpr)) +
  geom_line(color = C4, linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray60") +
  annotate("text", x = 0.6, y = 0.1,
           label = sprintf("AUC = %.4f", auc_rose),
           size = 4.5, fontface = "bold", color = C4) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  labs(title    = "ROC — Logística con ROSE",
       subtitle = "Clase positiva: Mora  |  Test original (sin balancear)",
       x = "Tasa Falsos Positivos", y = "Tasa Verdaderos Positivos") +
  tema_base
print(p_roc2)

# Explorar umbral óptimo para maximizar F1 en clase Mora
umbrales   <- seq(0.01, 0.99, by = 0.01)
f1_umbral  <- sapply(umbrales, function(u) {
  pred_u <- factor(if_else(prob_rose >= u, "Mora", "Oportuno"),
                   levels = c("Mora", "Oportuno"))
  cm_u   <- suppressWarnings(
    confusionMatrix(pred_u, df_test$pago_oportuno_bin, positive = "Mora"))
  unname(cm_u$byClass["F1"])
})

umbral_optimo <- umbrales[which.max(replace(f1_umbral, is.na(f1_umbral), 0))]
cat(sprintf("\n  Umbral óptimo para maximizar F1 (Mora): %.2f\n", umbral_optimo))

## 
##   Umbral óptimo para maximizar F1 (Mora): 0.97

cat(sprintf("  F1 máximo                              : %.4f\n",
            max(f1_umbral, na.rm = TRUE)))

##   F1 máximo                              : 0.7540

clase_rose_opt <- factor(if_else(prob_rose >= umbral_optimo, "Mora", "Oportuno"),
                          levels = c("Mora", "Oportuno"))
cat("\n── Matriz de Confusión — ROSE (umbral óptimo) ──\n")

## 
## ── Matriz de Confusión — ROSE (umbral óptimo) ──

cm_rose_opt <- confusionMatrix(clase_rose_opt, df_test$pago_oportuno_bin,
                                positive = "Mora")
print(cm_rose_opt)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Mora Oportuno
##   Mora       633      254
##   Oportuno   159    41446
##                                           
##                Accuracy : 0.9903          
##                  95% CI : (0.9893, 0.9912)
##     No Information Rate : 0.9814          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7491          
##                                           
##  Mcnemar's Test P-Value : 3.738e-06       
##                                           
##             Sensitivity : 0.79924         
##             Specificity : 0.99391         
##          Pos Pred Value : 0.71364         
##          Neg Pred Value : 0.99618         
##              Prevalence : 0.01864         
##          Detection Rate : 0.01490         
##    Detection Prevalence : 0.02087         
##       Balanced Accuracy : 0.89658         
##                                           
##        'Positive' Class : Mora            
##

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 9 ▸ MODELO 4 — k-NN CON DATOS BALANCEADOS
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 9 — MODELO 4: k-NN\n")

##   SECCIÓN 9 — MODELO 4: k-NN

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# Variables numéricas para kNN (requiere solo numéricas)
VARS_KNN <- intersect(
  c("log_monto","prom_mora_cliente","max_mora_cliente",
    "freq_mora_cliente","desv_mora_cliente",
    "dias_hasta_vencimiento","mes_documento",
    "estado_num","score_riesgo"),
  names(df_smote)
)

# Preparar matrices de entrenamiento (SMOTE) y test (original)
X_train_knn <- df_smote %>%
  select(all_of(make.names(VARS_KNN))) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na() %>%
  as.matrix()

y_train_knn <- df_smote$pago_oportuno_bin[
  complete.cases(df_smote %>% select(all_of(make.names(VARS_KNN))))
]

X_test_knn <- df_test %>%
  select(all_of(VARS_KNN)) %>%
  mutate(across(everything(), as.numeric)) %>%
  as.matrix()

y_test_knn <- df_test$pago_oportuno_bin

# Normalización min-max (OBLIGATORIA para kNN)
norm_minmax <- function(x) {
  mn <- min(x, na.rm = TRUE); mx <- max(x, na.rm = TRUE)
  if (mx == mn) return(rep(0, length(x)))
  (x - mn) / (mx - mn)
}

X_train_norm <- apply(X_train_knn, 2, norm_minmax)
X_test_norm  <- apply(X_test_knn,  2, norm_minmax)

# Eliminar NAs del conjunto de test
idx_ok        <- complete.cases(X_test_norm)
X_test_norm   <- X_test_norm[idx_ok, ]
y_test_knn    <- y_test_knn[idx_ok]

cat("  Train kNN (SMOTE)  :", nrow(X_train_norm), "registros\n")

##   Train kNN (SMOTE)  : 39000 registros

cat("  Test kNN (original):", nrow(X_test_norm),  "registros\n\n")

##   Test kNN (original): 42490 registros

# ── 9.1 Selección del K óptimo ────────────────────────────────────────────────
cat("Evaluando K = 1, 3, 5, 7, 9, 11, 13, 15...\n")

## Evaluando K = 1, 3, 5, 7, 9, 11, 13, 15...

k_vals   <- c(1, 3, 5, 7, 9, 11, 13, 15)
acc_knn  <- numeric(length(k_vals))
f1_knn_v <- numeric(length(k_vals))
kap_knn  <- numeric(length(k_vals))

for (i in seq_along(k_vals)) {
  set.seed(42)
  pred_k <- knn(train = X_train_norm, test = X_test_norm,
                cl = y_train_knn, k = k_vals[i])
  pred_k <- factor(pred_k, levels = c("Mora","Oportuno"))
  cm_k   <- suppressWarnings(
    confusionMatrix(pred_k, y_test_knn, positive = "Mora"))
  acc_knn[i]  <- cm_k$overall["Accuracy"]
  f1_knn_v[i] <- cm_k$byClass["F1"]
  kap_knn[i]  <- cm_k$overall["Kappa"]
  cat(sprintf("  K = %2d | Accuracy = %.4f | F1 = %.4f | Kappa = %.4f\n",
              k_vals[i], acc_knn[i],
              ifelse(is.na(f1_knn_v[i]), 0, f1_knn_v[i]),
              kap_knn[i]))
}

##   K =  1 | Accuracy = 0.9810 | F1 = 0.6535 | Kappa = 0.6448
##   K =  3 | Accuracy = 0.9725 | F1 = 0.5735 | Kappa = 0.5619
##   K =  5 | Accuracy = 0.9708 | F1 = 0.5591 | Kappa = 0.5470
##   K =  7 | Accuracy = 0.9704 | F1 = 0.5572 | Kappa = 0.5450
##   K =  9 | Accuracy = 0.9699 | F1 = 0.5529 | Kappa = 0.5405
##   K = 11 | Accuracy = 0.9696 | F1 = 0.5510 | Kappa = 0.5385
##   K = 13 | Accuracy = 0.9693 | F1 = 0.5485 | Kappa = 0.5359
##   K = 15 | Accuracy = 0.9694 | F1 = 0.5489 | Kappa = 0.5363

# K óptimo: mayor Kappa (robusto al desbalance)
k_opt <- k_vals[which.max(kap_knn)]
cat(sprintf("\n✓ K óptimo: %d  (Kappa = %.4f)\n", k_opt, max(kap_knn)))

## 
## ✓ K óptimo: 1  (Kappa = 0.6448)

# Gráfico K vs métricas
p_knn_k <- data.frame(
  K        = rep(k_vals, 3),
  Valor    = c(acc_knn, replace(f1_knn_v, is.na(f1_knn_v), 0), kap_knn),
  Metrica  = rep(c("Accuracy","F1","Kappa"), each = length(k_vals))
) %>%
  ggplot(aes(x = K, y = Valor, color = Metrica, group = Metrica)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  geom_vline(xintercept = k_opt, linetype = "dashed", color = C3, linewidth = 0.9) +
  annotate("text", x = k_opt + 0.4, y = 0.05,
           label = paste("K =", k_opt), color = C3, size = 3.8, fontface = "bold") +
  scale_color_manual(values = c("Accuracy" = C1, "F1" = C4, "Kappa" = C5)) +
  scale_x_continuous(breaks = k_vals) +
  labs(title    = "k-NN: Métricas por Valor de K",
       subtitle = "Datos SMOTE (train) → Test original (sin balancear)",
       x = "K vecinos", y = "Valor de la métrica") +
  tema_base
print(p_knn_k)

# ── 9.2 Modelo kNN final ─────────────────────────────────────────────────────
set.seed(42)
pred_knn <- knn(train = X_train_norm, test = X_test_norm,
                cl = y_train_knn, k = k_opt)
pred_knn <- factor(pred_knn, levels = c("Mora","Oportuno"))

cat(sprintf("\n── Matriz de Confusión — k-NN (K = %d, SMOTE) ──\n", k_opt))

## 
## ── Matriz de Confusión — k-NN (K = 1, SMOTE) ──

cm_knn <- confusionMatrix(pred_knn, y_test_knn, positive = "Mora")
print(cm_knn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Mora Oportuno
##   Mora       762      778
##   Oportuno    30    40920
##                                           
##                Accuracy : 0.981           
##                  95% CI : (0.9796, 0.9823)
##     No Information Rate : 0.9814          
##     P-Value [Acc > NIR] : 0.7243          
##                                           
##                   Kappa : 0.6448          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.96212         
##             Specificity : 0.98134         
##          Pos Pred Value : 0.49481         
##          Neg Pred Value : 0.99927         
##              Prevalence : 0.01864         
##          Detection Rate : 0.01793         
##    Detection Prevalence : 0.03624         
##       Balanced Accuracy : 0.97173         
##                                           
##        'Positive' Class : Mora            
##

cat("\nTabla:\n");          print(cm_knn$table)

## 
## Tabla:

##           Reference
## Prediction  Mora Oportuno
##   Mora       762      778
##   Oportuno    30    40920

cat("\nMétricas clase:\n"); print(cm_knn$byClass)

## 
## Métricas clase:

##          Sensitivity          Specificity       Pos Pred Value 
##           0.96212121           0.98134203           0.49480519 
##       Neg Pred Value            Precision               Recall 
##           0.99926740           0.49480519           0.96212121 
##                   F1           Prevalence       Detection Rate 
##           0.65351630           0.01863968           0.01793363 
## Detection Prevalence    Balanced Accuracy 
##           0.03624382           0.97173162

cat("\nMétricas global:\n");print(cm_knn$overall)

## 
## Métricas global:

##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.809838e-01   6.447711e-01   9.796400e-01   9.822612e-01   9.813603e-01 
## AccuracyPValue  McnemarPValue 
##   7.242724e-01  3.301302e-152

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 10 ▸ COMPARACIÓN Y SELECCIÓN DE MODELOS
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 10 — COMPARACIÓN DE MODELOS\n")

##   SECCIÓN 10 — COMPARACIÓN DE MODELOS

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# ── Extraer métricas de cada modelo de clasificación ─────────────────────────
extraer_metricas <- function(cm, nombre, auc = NA) {
  data.frame(
    Modelo     = nombre,
    Accuracy   = round(cm$overall["Accuracy"],     4),
    Kappa      = round(cm$overall["Kappa"],         4),
    Precision  = round(cm$byClass["Precision"],     4),
    Recall     = round(cm$byClass["Sensitivity"],   4),
    F1         = round(cm$byClass["F1"],            4),
    Specificity= round(cm$byClass["Specificity"],   4),
    AUC        = round(auc, 4),
    row.names  = NULL
  )
}

tab_comp <- bind_rows(
  extraer_metricas(cm_logit,    "Logística (original,  u=0.50)", auc_logit),
  extraer_metricas(cm_rose,     "Logística (ROSE,      u=0.50)", auc_rose),
  extraer_metricas(cm_rose_opt, sprintf("Logística (ROSE,      u=%.2f)", umbral_optimo), auc_rose),
  extraer_metricas(cm_knn,      sprintf("k-NN (SMOTE,      K=%d)", k_opt))
)

cat("── Tabla comparativa de modelos de clasificación ──\n")

## ── Tabla comparativa de modelos de clasificación ──

print(tab_comp, digits = 4)

##                          Modelo Accuracy  Kappa Precision Recall     F1
## 1 Logística (original,  u=0.50)   0.9960 0.8864    0.9133 0.8649 0.8885
## 2 Logística (ROSE,      u=0.50)   0.9760 0.5970    0.4367 0.9975 0.6075
## 3 Logística (ROSE,      u=0.97)   0.9903 0.7491    0.7136 0.7992 0.7540
## 4        k-NN (SMOTE,      K=1)   0.9810 0.6448    0.4948 0.9621 0.6535
##   Specificity    AUC
## 1      0.9984 0.0008
## 2      0.9756 0.9966
## 3      0.9939 0.9966
## 4      0.9813     NA

cat("\n── Regresión Lineal (Arrears_calc) ──\n")

## 
## ── Regresión Lineal (Arrears_calc) ──

cat(sprintf("  RMSE Test  : %.4f días\n", rmse_te))

##   RMSE Test  : 8.8456 días

cat(sprintf("  MAE  Test  : %.4f días\n", mae_te))

##   MAE  Test  : 1.5695 días

cat(sprintf("  R²   Test  : %.6f\n",      r2_te))

##   R²   Test  : 0.406928

cat(sprintf("  AIC step.  : %.2f\n",      AIC(modelo_lm_step)))

##   AIC step.  : 753889.48

# Modelo recomendado (mejor F1 en clase Mora)
mejor_idx <- which.max(replace(tab_comp$F1, is.na(tab_comp$F1), 0))
cat("\n✓ MODELO RECOMENDADO (mayor F1 en clase Mora):\n")

## 
## ✓ MODELO RECOMENDADO (mayor F1 en clase Mora):

cat("  →", tab_comp$Modelo[mejor_idx], "\n")

##   → Logística (original,  u=0.50)

cat(sprintf("    Accuracy=%.4f | F1=%.4f | Kappa=%.4f | Recall=%.4f\n",
            tab_comp$Accuracy[mejor_idx],
            tab_comp$F1[mejor_idx],
            tab_comp$Kappa[mejor_idx],
            tab_comp$Recall[mejor_idx]))

##     Accuracy=0.9960 | F1=0.8885 | Kappa=0.8864 | Recall=0.8649

# Gráfico comparativo
p_comp <- tab_comp %>%
  select(Modelo, Accuracy, Precision, Recall, F1, Kappa) %>%
  pivot_longer(-Modelo, names_to = "Metrica", values_to = "Valor") %>%
  filter(!is.na(Valor)) %>%
  ggplot(aes(x = Metrica, y = Valor, fill = Modelo)) +
  geom_col(position = "dodge", alpha = 0.85) +
  geom_text(aes(label = round(Valor, 3)),
            position = position_dodge(width = 0.9),
            vjust = -0.3, size = 2.7, fontface = "bold") +
  scale_fill_manual(values = c(C1, C4, C3, C5)) +
  scale_y_continuous(limits = c(0, 1.15), labels = percent) +
  labs(title    = "Comparación de Modelos de Clasificación",
       subtitle = "Evaluación en test original (datos sin balancear)",
       x = NULL, y = "Valor") +
  tema_base +
  theme(legend.position = "right",
        legend.text      = element_text(size = 8))
print(p_comp)

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 11 ▸ PANEL GRÁFICO CONSOLIDADO
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 11 — PANEL GRÁFICO CONSOLIDADO\n")

##   SECCIÓN 11 — PANEL GRÁFICO CONSOLIDADO

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

# Curvas ROC superpuestas
p_roc_all <- bind_rows(
  roc_df   %>% mutate(Modelo = "Logística original"),
  roc_rose %>% mutate(Modelo = "Logística ROSE")
) %>%
  ggplot(aes(x = fpr, y = tpr, color = Modelo)) +
  geom_line(linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray60") +
  annotate("text", x = 0.55, y = 0.12,
           label = sprintf("AUC orig = %.4f\nAUC ROSE = %.4f",
                           auc_logit, auc_rose),
           size = 3.8, fontface = "bold", color = "gray30") +
  scale_color_manual(values = c("Logística original" = C1,
                                 "Logística ROSE"     = C4)) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  labs(title = "Curvas ROC — Comparación",
       x = "FPR", y = "TPR") +
  tema_base

panel_final <- (p_target | p_bkt) /
               (p_roc_all | p_knn_k) /
               (p_lm | p_comp) +
  plot_annotation(
    title    = "Panel Consolidado — Paso 4: Modelado Predictivo I2C",
    subtitle = paste0(
      formatC(nrow(df_work), big.mark=","), " registros RV  ·  ",
      "265 clientes  ·  Logística + k-NN + Regresión  ·  ",
      format(Sys.time(), "%d/%m/%Y")
    ),
    caption  = "Fuente: Base_Modelado_I2C_Lista_1.xlsx  |  Solo facturas RV",
    theme    = theme(
      plot.title    = element_text(face = "bold", size = 15, color = C1),
      plot.subtitle = element_text(size = 10, color = "gray40"),
      plot.caption  = element_text(size = 8, color = "gray55")
    )
  )
print(panel_final)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 12 ▸ EXPORTACIÓN DE RESULTADOS
# ───────────────────────────────────────────────────────────────────────────────

cat("\n═══════════════════════════════════════════════════════\n")

## 
## ═══════════════════════════════════════════════════════

cat("  SECCIÓN 12 — EXPORTACIÓN\n")

##   SECCIÓN 12 — EXPORTACIÓN

cat("═══════════════════════════════════════════════════════\n\n")

## ═══════════════════════════════════════════════════════

dir.create("output_modelado", recursive = TRUE, showWarnings = FALSE)

# Predicciones del mejor modelo en todo el dataset de trabajo
prob_final   <- predict(modelo_logit_rose_step,
                        newdata  = df_work, type = "response")
clase_final  <- factor(if_else(prob_final >= umbral_optimo, "Mora", "Oportuno"),
                       levels = c("Mora", "Oportuno"))

df_pred_export <- df_work %>%
  select(anon_customer_id, anon_document_id, document_date,
         terms_of_payment, amount_in_local_currency,
         arrears_calc, pago_oportuno_bin, bucket_mora_calc,
         prom_mora_cliente, freq_mora_cliente) %>%
  mutate(
    prob_mora           = round(prob_final, 4),
    pred_clase          = clase_final,
    clasificacion_ok    = if_else(pred_clase == pago_oportuno_bin,
                                  "Correcta", "Error"),
    nivel_riesgo        = case_when(
      prob_final >= 0.70 ~ "RIESGO ALTO",
      prob_final >= 0.40 ~ "RIESGO MEDIO",
      prob_final >= 0.20 ~ "RIESGO BAJO",
      TRUE               ~ "SIN RIESGO"
    )
  )

# Exportar CSVs
write.csv(df_pred_export, "output_modelado/predicciones_modelo_final.csv",
          row.names = FALSE, fileEncoding = "UTF-8")
cat("✓ Predicciones exportadas\n")

## ✓ Predicciones exportadas

write.csv(tab_comp, "output_modelado/tabla_comparacion_modelos.csv",
          row.names = FALSE)
cat("✓ Tabla comparativa exportada\n")

## ✓ Tabla comparativa exportada

coef_export <- data.frame(
  Variable   = names(coef(modelo_logit_rose_step)),
  Coeficiente= round(coef(modelo_logit_rose_step), 4),
  Odds_Ratio = round(exp(coef(modelo_logit_rose_step)), 4)
)
write.csv(coef_export, "output_modelado/coeficientes_logistica_rose.csv",
          row.names = FALSE)
cat("✓ Coeficientes exportados\n")

## ✓ Coeficientes exportados

# Guardar gráficos
ggsave("output_modelado/panel_consolidado.png",
       plot = panel_final, width = 20, height = 16, dpi = 150, bg = "white")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave("output_modelado/comparacion_modelos.png",
       plot = p_comp, width = 12, height = 6, dpi = 150, bg = "white")
ggsave("output_modelado/curvas_roc.png",
       plot = p_roc_all, width = 8, height = 6, dpi = 150, bg = "white")
ggsave("output_modelado/knn_k_selection.png",
       plot = p_knn_k, width = 9, height = 5, dpi = 150, bg = "white")
ggsave("output_modelado/regresion_real_vs_predicho.png",
       plot = p_lm, width = 8, height = 6, dpi = 150, bg = "white")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

cat("✓ Gráficos exportados en ./output_modelado/\n")

## ✓ Gráficos exportados en ./output_modelado/

# ── Resumen final ─────────────────────────────────────────────────────────────
cat("\n", strrep("═", 60), "\n", sep="")

## 
## ════════════════════════════════════════════════════════════

cat("  RESUMEN FINAL — PASO 4 MODELADO PREDICTIVO I2C\n")

##   RESUMEN FINAL — PASO 4 MODELADO PREDICTIVO I2C

cat(strrep("═", 60), "\n\n", sep="")

## ════════════════════════════════════════════════════════════

cat("  DATASET\n")

##   DATASET

cat(sprintf("    Registros    : %s (solo RV)\n",
            formatC(nrow(df_work), big.mark = ",")))

##     Registros    : 141,638 (solo RV)

cat(sprintf("    Clientes     : %d\n", n_distinct(df$anon_customer_id)))

##     Clientes     : 265

cat(sprintf("    Target 0(Mora): %s (%.1f%%)\n",
            formatC(sum(df_work$pago_oportuno_bin=="Mora"), big.mark=","),
            mean(df_work$pago_oportuno_bin=="Mora")*100))

##     Target 0(Mora): 2,742 (1.9%)

cat(sprintf("    Target 1(Op.) : %s (%.1f%%)\n\n",
            formatC(sum(df_work$pago_oportuno_bin=="Oportuno"), big.mark=","),
            mean(df_work$pago_oportuno_bin=="Oportuno")*100))

##     Target 1(Op.) : 138,896 (98.1%)

cat("  CLASIFICACIÓN (Pago_Oportuno_Bin — clase positiva: Mora)\n")

##   CLASIFICACIÓN (Pago_Oportuno_Bin — clase positiva: Mora)

cat("  ┌──────────────────────────────┬──────────┬──────────┬──────────┬──────────┐\n")

##   ┌──────────────────────────────┬──────────┬──────────┬──────────┬──────────┐

cat("  │ Modelo                       │ Accuracy │    F1    │  Kappa   │   AUC    │\n")

##   │ Modelo                       │ Accuracy │    F1    │  Kappa   │   AUC    │

cat("  ├──────────────────────────────┼──────────┼──────────┼──────────┼──────────┤\n")

##   ├──────────────────────────────┼──────────┼──────────┼──────────┼──────────┤

for (i in seq_len(nrow(tab_comp))) {
  cat(sprintf("  │ %-28s │  %.4f  │  %.4f  │  %.4f  │  %s  │\n",
              substr(tab_comp$Modelo[i], 1, 28),
              tab_comp$Accuracy[i],
              ifelse(is.na(tab_comp$F1[i]), 0, tab_comp$F1[i]),
              tab_comp$Kappa[i],
              ifelse(is.na(tab_comp$AUC[i]), "  N/A ", sprintf("%.4f", tab_comp$AUC[i]))))
}

##   │ Logística (original,  u=0.50 │  0.9960  │  0.8885  │  0.8864  │  0.0008  │
##   │ Logística (ROSE,      u=0.50 │  0.9760  │  0.6075  │  0.5970  │  0.9966  │
##   │ Logística (ROSE,      u=0.97 │  0.9903  │  0.7540  │  0.7491  │  0.9966  │
##   │ k-NN (SMOTE,      K=1)       │  0.9810  │  0.6535  │  0.6448  │    N/A   │

cat("  └──────────────────────────────┴──────────┴──────────┴──────────┴──────────┘\n")

##   └──────────────────────────────┴──────────┴──────────┴──────────┴──────────┘

cat("\n  REGRESIÓN LINEAL (Arrears_calc)\n")

## 
##   REGRESIÓN LINEAL (Arrears_calc)

cat(sprintf("    RMSE Test  : %.4f días\n", rmse_te))

##     RMSE Test  : 8.8456 días

cat(sprintf("    MAE  Test  : %.4f días\n", mae_te))

##     MAE  Test  : 1.5695 días

cat(sprintf("    R²   Test  : %.6f\n",      r2_te))

##     R²   Test  : 0.406928

cat(sprintf("\n  MODELO RECOMENDADO: %s\n", tab_comp$Modelo[mejor_idx]))

## 
##   MODELO RECOMENDADO: Logística (original,  u=0.50)

cat(sprintf("    Umbral de decisión: %.2f\n", umbral_optimo))

##     Umbral de decisión: 0.97

cat("\n  Archivos generados en ./output_modelado/\n")

## 
##   Archivos generados en ./output_modelado/

cat(strrep("═", 60), "\n", sep="")

## ════════════════════════════════════════════════════════════

cat("✅  PASO 4 COMPLETADO\n\n")

## ✅  PASO 4 COMPLETADO

# ═══════════════════════════════════════════════════════════════════════════════
# GUÍA DE INTERPRETACIÓN
# ═══════════════════════════════════════════════════════════════════════════════
#
# REGRESIÓN LINEAL (Arrears_calc)
# ─────────────────────────────────────────────────────────────────────────────
# • RMSE en días: error promedio del modelo en la predicción de días de atraso.
#   Un RMSE de 5 días significa que el modelo se equivoca en promedio ±5 días.
# • R² ≈ 0: indica que las variables disponibles explican poco la variabilidad
#   de arrears. Esto es esperado porque el 98% tiene arrears ≤ 0 (al día).
# • Coeficiente positivo en prom_mora_cliente → clientes con historial de mora
#   alto tienden a acumular más días de atraso.
# • DW ≈ 2 → residuos sin autocorrelación (modelo bien especificado).
# • BIC mínimo → equilibrio óptimo entre ajuste y complejidad del modelo.
#
# REGRESIÓN LOGÍSTICA (Pago_Oportuno_Bin)
# ─────────────────────────────────────────────────────────────────────────────
# • OR > 1: la variable aumenta la probabilidad de MORA.
# • OR < 1: la variable disminuye la probabilidad de mora (protector).
# • AUC > 0.7 = modelo útil; > 0.8 = muy bueno; > 0.9 = excelente.
# • El umbral óptimo (umbral_optimo) maximiza F1 en la clase Mora.
#   → Un umbral bajo captura más casos de mora (mayor Recall) a costa
#     de más falsos positivos (menor Precision).
#
# k-NN
# ─────────────────────────────────────────────────────────────────────────────
# • La normalización min-max es obligatoria: sin ella las variables de mayor
#   escala (log_monto) dominarían las distancias.
# • K pequeño → sobreajuste; K grande → subajuste. El K óptimo se elige
#   por mayor Kappa (métrica robusta al desbalance severo).
# • El modelo se entrena con SMOTE y se evalúa en test ORIGINAL.
#
# BALANCEO DE CLASES (desbalance 98,1% / 1,9%)
# ─────────────────────────────────────────────────────────────────────────────
# • Sin balanceo, el modelo predice casi siempre "Oportuno" y obtiene
#   Accuracy ≈ 98% pero Recall(Mora) ≈ 0% → completamente inútil para
#   detectar mora.
# • ROSE y SMOTE sintetizan casos de mora para que el modelo los aprenda.
# • Evaluar SIEMPRE en el test original (sin balancear) para medir el
#   rendimiento real en producción.
# • Kappa y F1 son las métricas correctas con desbalance severo.
#   Accuracy sola es engañosa.
#
# SELECCIÓN DEL MODELO
# ─────────────────────────────────────────────────────────────────────────────
# • Priorizar el modelo con mayor F1(Mora) y Kappa.
# • Si el negocio penaliza más los falsos negativos (mora no detectada),
#   bajar el umbral para maximizar Recall a costa de Precision.
# • Si el negocio penaliza los falsos positivos (alertas innecesarias),
#   subir el umbral para maximizar Precision.
# ═══════════════════════════════════════════════════════════════════════════════

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 11 — GRÁFICOS INDIVIDUALES + PANEL CONSOLIDADO (MEJORADO)
#  Ejecutar después de completar las Secciones 1–10 del script principal.
#  Genera cada gráfico por separado (legibles) y un panel final limpio.
# ═══════════════════════════════════════════════════════════════════════════════
#  REQUISITO: tener en el entorno los objetos creados en las secciones anteriores:
#    df_work, tab_comp, roc_df, roc_rose, auc_logit, auc_rose,
#    umbral_optimo, k_opt, k_vals, acc_knn, f1_knn_v, kap_knn,
#    rmse_te, r2_te, pred_lm_test, df_test, prob_rose, C1, C2, C3, C4, C5
# ═══════════════════════════════════════════════════════════════════════════════

library(ggplot2)
library(patchwork)
library(dplyr)
library(tidyr)
library(scales)

dir.create("output_modelado/graficos", recursive = TRUE, showWarnings = FALSE)

# ── Tema refinado para gráficos individuales ──────────────────────────────────
tema_plot <- theme_minimal(base_size = 13) +
  theme(
    plot.title        = element_text(face = "bold", size = 15, color = "#1B3A6B",
                                     margin = margin(b = 6)),
    plot.subtitle     = element_text(size = 11, color = "#555555",
                                     margin = margin(b = 10)),
    plot.caption      = element_text(size = 9, color = "#999999",
                                     margin = margin(t = 8)),
    plot.background   = element_rect(fill = "white", color = NA),
    panel.background  = element_rect(fill = "white", color = NA),
    panel.grid.major  = element_line(color = "#EEEEEE", linewidth = 0.5),
    panel.grid.minor  = element_blank(),
    axis.title        = element_text(size = 11, color = "#333333"),
    axis.text         = element_text(size = 10, color = "#444444"),
    legend.position   = "bottom",
    legend.title      = element_text(size = 10, face = "bold"),
    legend.text       = element_text(size = 10),
    legend.background = element_rect(fill = "white", color = NA),
    plot.margin       = margin(16, 16, 12, 16)
  )

C1 <- "#1B3A6B"; C2 <- "#E84040"; C3 <- "#F5A623"; C4 <- "#27AE60"; C5 <- "#8E44AD"

PALETA_BUCKET <- c(
  "Al dia"  = C4,   "1-30"    = "#5BA85A",
  "31-60"   = C3,   "61-90"   = "#D4891A",
  "91-180"  = "#C05010", "181-360" = C2, ">360" = "#7B0000"
)


# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 1 — Variable objetivo: Pago_Oportuno_Bin
# ═══════════════════════════════════════════════════════════════════════════════
tab_target <- df_work %>%
  count(pago_oportuno_bin) %>%
  mutate(
    pct   = round(n / sum(n) * 100, 1),
    label = paste0(comma(n), "\n(", pct, "%)")
  )

g1 <- ggplot(tab_target,
             aes(x = pago_oportuno_bin, y = n, fill = pago_oportuno_bin)) +
  geom_col(width = 0.55, alpha = 0.92, show.legend = FALSE) +
  geom_text(aes(label = label), vjust = -0.4,
            size = 4.5, fontface = "bold", color = "#222222") +
  scale_fill_manual(values = c("Mora" = C2, "Oportuno" = C4)) +
  scale_y_continuous(labels = comma,
                     expand = expansion(mult = c(0, 0.18))) +
  labs(
    title    = "Variable Objetivo: Pago_Oportuno_Bin",
    subtitle = "Desbalance severo — clave para estrategia de balanceo",
    x        = NULL,
    y        = "Número de registros",
    caption  = "Clase positiva de interés: Mora (0)"
  ) +
  tema_plot +
  theme(axis.text.x = element_text(size = 12, face = "bold"))

print(g1)

ggsave("output_modelado/graficos/G1_variable_objetivo.png",
       g1, width = 7, height = 5.5, dpi = 180, bg = "white")
cat("✓ G1 guardado\n")

## ✓ G1 guardado

# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 2 — Distribución Bucket_Mora
# ═══════════════════════════════════════════════════════════════════════════════
tab_bkt <- df_work %>%
  filter(!is.na(bucket_mora_calc)) %>%
  count(bucket_mora_calc) %>%
  mutate(
    pct          = round(n / sum(n) * 100, 2),
    bucket_mora_calc = factor(bucket_mora_calc,
      levels = c("Al dia","1-30","31-60","61-90","91-180","181-360",">360"))
  )

g2 <- ggplot(tab_bkt,
             aes(x = bucket_mora_calc, y = n, fill = bucket_mora_calc)) +
  geom_col(width = 0.7, alpha = 0.92, show.legend = FALSE) +
  geom_text(aes(label = paste0(pct, "%")),
            vjust = -0.5, size = 4, fontface = "bold", color = "#222222") +
  scale_fill_manual(values = PALETA_BUCKET) +
  scale_y_continuous(labels = comma,
                     expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Distribución por Bucket_Mora",
    subtitle = "Tramos de mora según días de atraso (Arrears_calc)",
    x        = "Tramo de mora",
    y        = "Registros",
    caption  = "Calculado desde Arrears_calc recalculado"
  ) +
  tema_plot

print(g2)

ggsave("output_modelado/graficos/G2_bucket_mora.png",
       g2, width = 8, height = 5.5, dpi = 180, bg = "white")
cat("✓ G2 guardado\n")

## ✓ G2 guardado

# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 3 — Curvas ROC superpuestas
# ═══════════════════════════════════════════════════════════════════════════════
roc_combinado <- bind_rows(
  roc_df   %>% select(fpr, tpr) %>% mutate(Modelo = "Logística original"),
  roc_rose %>% select(fpr, tpr) %>% mutate(Modelo = "Logística + ROSE")
)

g3 <- ggplot(roc_combinado, aes(x = fpr, y = tpr, color = Modelo)) +
  # Área bajo la curva (relleno suave)
  geom_ribbon(data = roc_df %>% mutate(Modelo = "Logística original"),
              aes(ymin = 0, ymax = tpr, x = fpr),
              fill = C1, alpha = 0.08, inherit.aes = FALSE) +
  geom_ribbon(data = roc_rose %>% mutate(Modelo = "Logística + ROSE"),
              aes(ymin = 0, ymax = tpr, x = fpr),
              fill = C4, alpha = 0.08, inherit.aes = FALSE) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed",
              color = "gray70", linewidth = 0.8) +
  geom_line(linewidth = 1.6) +
  # Etiquetas AUC con caja de fondo
  annotate("label", x = 0.60, y = 0.22,
           label = sprintf("AUC Original = %.4f", auc_logit),
           color = C1, fill = "white", size = 4, fontface = "bold",
           label.padding = unit(0.4, "lines"), label.r = unit(0.2, "lines")) +
  annotate("label", x = 0.60, y = 0.10,
           label = sprintf("AUC ROSE     = %.4f", auc_rose),
           color = C4, fill = "white", size = 4, fontface = "bold",
           label.padding = unit(0.4, "lines"), label.r = unit(0.2, "lines")) +
  scale_color_manual(values = c("Logística original" = C1,
                                 "Logística + ROSE"   = C4),
                     name = "Modelo") +
  scale_x_continuous(labels = percent, limits = c(0, 1),
                     breaks = seq(0, 1, 0.25)) +
  scale_y_continuous(labels = percent, limits = c(0, 1),
                     breaks = seq(0, 1, 0.25)) +
  labs(
    title    = "Curvas ROC — Comparación de Modelos",
    subtitle = "Clase positiva: Mora  |  Test original sin balancear",
    x        = "Tasa de Falsos Positivos (1 – Especificidad)",
    y        = "Tasa de Verdaderos Positivos (Sensibilidad)",
    caption  = "Línea diagonal = clasificador aleatorio (AUC = 0.5)"
  ) +
  tema_plot +
  theme(legend.position = c(0.72, 0.18),
        legend.background = element_rect(fill = "white", color = "#DDDDDD",
                                          linewidth = 0.4))

print(g3)

ggsave("output_modelado/graficos/G3_curvas_ROC.png",
       g3, width = 7.5, height = 6.5, dpi = 180, bg = "white")
cat("✓ G3 guardado\n")

## ✓ G3 guardado

# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 4 — k-NN: métricas por valor de K
# ═══════════════════════════════════════════════════════════════════════════════
df_knn_metricas <- data.frame(
  K       = rep(k_vals, 3),
  Valor   = c(acc_knn,
              replace(f1_knn_v,  is.na(f1_knn_v),  0),
              kap_knn),
  Metrica = rep(c("Accuracy", "F1 (Mora)", "Kappa"), each = length(k_vals))
)

g4 <- ggplot(df_knn_metricas,
             aes(x = K, y = Valor, color = Metrica, group = Metrica)) +
  geom_line(linewidth = 1.4) +
  geom_point(size = 4, stroke = 1.5, shape = 21,
             aes(fill = Metrica), color = "white") +
  geom_vline(xintercept = k_opt, linetype = "dashed",
             color = C3, linewidth = 1) +
  annotate("label", x = k_opt, y = max(acc_knn) * 1.01,
           label = paste0("K óptimo = ", k_opt),
           color = C3, fill = "white", size = 4, fontface = "bold",
           label.padding = unit(0.4,"lines"),
           label.r = unit(0.2,"lines")) +
  scale_color_manual(values = c("Accuracy" = C1, "F1 (Mora)" = C4, "Kappa" = C5),
                     name = "Métrica") +
  scale_fill_manual(values  = c("Accuracy" = C1, "F1 (Mora)" = C4, "Kappa" = C5),
                    guide = "none") +
  scale_x_continuous(breaks = k_vals) +
  scale_y_continuous(labels = percent,
                     breaks = seq(0, 1, 0.1),
                     limits = c(0, max(acc_knn) * 1.06)) +
  labs(
    title    = "k-NN: Métricas por Valor de K",
    subtitle = "Entrenado con SMOTE · Evaluado en test original",
    x        = "Número de vecinos (K)",
    y        = "Valor de la métrica",
    caption  = paste0("K óptimo seleccionado por mayor Kappa  |  ",
                      "Datos SMOTE (train) → Test original (sin balancear)")
  ) +
  tema_plot

print(g4)

ggsave("output_modelado/graficos/G4_kNN_metricas_K.png",
       g4, width = 8, height = 5.5, dpi = 180, bg = "white")
cat("✓ G4 guardado\n")

## ✓ G4 guardado

# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 5 — Regresión lineal: Real vs Predicho
# ═══════════════════════════════════════════════════════════════════════════════

# Construir dataframe alineado (excluir NAs del test)
df_lm_plot <- data.frame(
  real = df_test$arrears_calc,
  pred = pred_lm_test
) %>% drop_na()

# Muestra para rendimiento (máx 15.000 puntos)
set.seed(42)
df_lm_sample <- df_lm_plot %>%
  slice_sample(n = min(15000, nrow(df_lm_plot)))

# Límites simétricos p1-p99
lim_inf <- quantile(df_lm_plot$real, 0.01, na.rm = TRUE)
lim_sup <- quantile(df_lm_plot$real, 0.99, na.rm = TRUE)

g5 <- ggplot(df_lm_sample, aes(x = real, y = pred)) +
  geom_point(alpha = 0.12, size = 0.8, color = C1) +
  geom_abline(slope = 1, intercept = 0,
              color = C2, linetype = "dashed", linewidth = 1.2) +
  geom_smooth(method = "lm", color = C4, linewidth = 1,
              se = TRUE, alpha = 0.15, linetype = "solid") +
  coord_cartesian(xlim = c(lim_inf, lim_sup),
                  ylim = c(lim_inf, lim_sup)) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  annotate("label", x = lim_inf + (lim_sup - lim_inf) * 0.04,
           y = lim_sup * 0.95,
           label = sprintf("RMSE = %.4f días\nR²   = %.6f", rmse_te, r2_te),
           hjust = 0, size = 4, fontface = "bold", color = "#222222",
           fill = "white", label.padding = unit(0.5,"lines"),
           label.r = unit(0.2,"lines")) +
  labs(
    title    = "Regresión Lineal: Arrears Real vs Predicho",
    subtitle = "Muestra aleatoria p1–p99  |  Stepwise AIC",
    x        = "Arrears real (días de atraso)",
    y        = "Arrears predicho (días de atraso)",
    caption  = "Línea roja = predicción perfecta  |  Línea verde = tendencia real (OLS)"
  ) +
  tema_plot

print(g5)

## `geom_smooth()` using formula = 'y ~ x'

ggsave("output_modelado/graficos/G5_regresion_real_vs_predicho.png",
       g5, width = 7.5, height = 6.5, dpi = 180, bg = "white")

## `geom_smooth()` using formula = 'y ~ x'

cat("✓ G5 guardado\n")

## ✓ G5 guardado

# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 6 — Comparación de modelos de clasificación
# ═══════════════════════════════════════════════════════════════════════════════

# Tabla limpia: nombre corto para visualización
tab_comp_plot <- tab_comp %>%
  mutate(
    Modelo_corto = case_when(
      grepl("original.*0.50", Modelo)  ~ "Logística\noriginal\nu=0.50",
      grepl("ROSE.*0.50",     Modelo)  ~ "Logística\nROSE\nu=0.50",
      grepl("ROSE.*0\\.[0-9]", Modelo) ~ paste0("Logística\nROSE\nu=", umbral_optimo),
      grepl("k-NN",            Modelo) ~ paste0("k-NN\n(SMOTE)\nK=", k_opt),
      TRUE ~ Modelo
    )
  ) %>%
  select(Modelo_corto, Accuracy, Precision, Recall, F1, Kappa) %>%
  pivot_longer(-Modelo_corto, names_to = "Metrica", values_to = "Valor") %>%
  filter(!is.na(Valor)) %>%
  mutate(
    Metrica = factor(Metrica,
                     levels = c("Accuracy","Precision","Recall","F1","Kappa")),
    Modelo_corto = factor(Modelo_corto,
                          levels = unique(Modelo_corto))
  )

# Colores por modelo
colores_modelo <- setNames(
  c(C1, C4, C3, C5),
  levels(tab_comp_plot$Modelo_corto)
)

g6 <- ggplot(tab_comp_plot,
             aes(x = Metrica, y = Valor, fill = Modelo_corto)) +
  geom_col(position = position_dodge(width = 0.78),
           alpha = 0.90, width = 0.72) +
  geom_text(aes(label = sprintf("%.3f", Valor)),
            position = position_dodge(width = 0.78),
            vjust = -0.45, size = 3.2, fontface = "bold",
            color = "#222222") +
  geom_hline(yintercept = c(0.25, 0.50, 0.75),
             color = "#DDDDDD", linewidth = 0.4, linetype = "solid") +
  scale_fill_manual(values = colores_modelo, name = "Modelo") +
  scale_y_continuous(limits  = c(0, 1.12),
                     breaks  = seq(0, 1, 0.25),
                     labels  = percent_format(accuracy = 1)) +
  labs(
    title    = "Comparación de Modelos de Clasificación",
    subtitle = "Clase positiva: Mora  |  Evaluación en conjunto de prueba (test original)",
    x        = NULL,
    y        = "Valor de la métrica",
    caption  = paste0("Accuracy alta no implica buen modelo con desbalance severo.",
                      "\nPriorizar F1 y Kappa en clase Mora.")
  ) +
  tema_plot +
  theme(
    legend.position  = "bottom",
    legend.title     = element_text(size = 10, face = "bold"),
    legend.text      = element_text(size = 9),
    axis.text.x      = element_text(size = 11, face = "bold", color = "#1B3A6B")
  ) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

print(g6)

ggsave("output_modelado/graficos/G6_comparacion_modelos.png",
       g6, width = 10, height = 6.5, dpi = 180, bg = "white")
cat("✓ G6 guardado\n")

## ✓ G6 guardado

# ═══════════════════════════════════════════════════════════════════════════════
# PANEL FINAL CONSOLIDADO — diseño limpio 3 × 2
# ═══════════════════════════════════════════════════════════════════════════════

# Tema más compacto para el panel
tema_panel <- theme_minimal(base_size = 10) +
  theme(
    plot.title        = element_text(face = "bold", size = 11, color = "#1B3A6B"),
    plot.subtitle     = element_text(size = 8.5, color = "#666666"),
    plot.background   = element_rect(fill = "white", color = NA),
    panel.background  = element_rect(fill = "white", color = NA),
    panel.grid.major  = element_line(color = "#F0F0F0", linewidth = 0.4),
    panel.grid.minor  = element_blank(),
    axis.title        = element_text(size = 9),
    axis.text         = element_text(size = 8.5),
    legend.position   = "bottom",
    legend.text       = element_text(size = 8),
    legend.title      = element_text(size = 8.5, face = "bold"),
    legend.key.size   = unit(0.45, "cm"),
    plot.margin       = margin(10, 10, 8, 10)
  )

# Versiones compactas de cada gráfico ─────────────────────────────────────────

# P1 — Objetivo
p1 <- ggplot(tab_target,
             aes(x = pago_oportuno_bin, y = n, fill = pago_oportuno_bin)) +
  geom_col(width = 0.5, alpha = 0.9, show.legend = FALSE) +
  geom_text(aes(label = paste0("(", pct, "%)")),
            vjust = -0.4, size = 3.5, fontface = "bold") +
  scale_fill_manual(values = c("Mora" = C2, "Oportuno" = C4)) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.18))) +
  labs(title = "Variable Objetivo", subtitle = "Desbalance: 98,1% vs 1,9%",
       x = NULL, y = "Registros") +
  tema_panel

# P2 — Bucket
p2 <- ggplot(tab_bkt, aes(x = bucket_mora_calc, y = n, fill = bucket_mora_calc)) +
  geom_col(width = 0.7, alpha = 0.9, show.legend = FALSE) +
  geom_text(aes(label = paste0(pct, "%")), vjust = -0.4, size = 3, fontface = "bold") +
  scale_fill_manual(values = PALETA_BUCKET) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.18))) +
  labs(title = "Distribución Bucket_Mora", subtitle = "Tramos de mora por días de atraso",
       x = NULL, y = "Registros") +
  tema_panel +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))

# P3 — ROC
p3 <- ggplot(roc_combinado, aes(x = fpr, y = tpr, color = Modelo)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed",
              color = "gray75", linewidth = 0.7) +
  geom_line(linewidth = 1.3) +
  annotate("text", x = 0.55, y = 0.18,
           label = sprintf("AUC orig: %.4f\nAUC ROSE: %.4f", auc_logit, auc_rose),
           size = 3.2, fontface = "bold", color = "#333333") +
  scale_color_manual(values = c("Logística original" = C1,
                                 "Logística + ROSE"   = C4), name = NULL) +
  scale_x_continuous(labels = percent, breaks = c(0, 0.5, 1)) +
  scale_y_continuous(labels = percent, breaks = c(0, 0.5, 1)) +
  labs(title = "Curvas ROC", subtitle = "Clase positiva: Mora",
       x = "FPR", y = "TPR") +
  tema_panel +
  theme(legend.position = c(0.65, 0.15))

# P4 — kNN
p4 <- ggplot(df_knn_metricas,
             aes(x = K, y = Valor, color = Metrica, group = Metrica)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  geom_vline(xintercept = k_opt, linetype = "dashed", color = C3, linewidth = 0.9) +
  annotate("text", x = k_opt + 0.5, y = min(kap_knn),
           label = paste0("K=", k_opt), color = C3, size = 3.5, fontface = "bold") +
  scale_color_manual(values = c("Accuracy" = C1, "F1 (Mora)" = C4, "Kappa" = C5),
                     name = NULL) +
  scale_x_continuous(breaks = k_vals) +
  scale_y_continuous(labels = percent) +
  labs(title = paste0("k-NN: Métricas vs K  (K óptimo = ", k_opt, ")"),
       subtitle = "Train: SMOTE · Test: original",
       x = "K vecinos", y = "Valor") +
  tema_panel

# P5 — Regresión real vs predicho
p5 <- ggplot(df_lm_sample %>% filter(!is.na(real), !is.na(pred)),
             aes(x = real, y = pred)) +
  geom_point(alpha = 0.10, size = 0.6, color = C1) +
  geom_abline(slope = 1, intercept = 0, color = C2,
              linetype = "dashed", linewidth = 1) +
  geom_smooth(method = "lm", color = C4, linewidth = 0.9,
              se = FALSE) +
  coord_cartesian(xlim = c(lim_inf, lim_sup),
                  ylim = c(lim_inf, lim_sup)) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  annotate("label", x = lim_inf + (lim_sup-lim_inf)*0.03,
           y = lim_sup * 0.94,
           label = sprintf("RMSE=%.2f\nR²=%.4f", rmse_te, r2_te),
           hjust = 0, size = 3.2, fill = "white",
           label.padding = unit(0.3,"lines")) +
  labs(title = "Regresión Lineal: Real vs Predicho",
       subtitle = "Arrears_calc (días de atraso)",
       x = "Real", y = "Predicho") +
  tema_panel

# P6 — Comparación modelos (versión compacta)
tab_comp_compact <- tab_comp_plot %>%
  filter(Metrica %in% c("Accuracy","F1","Kappa"))

colores_panel <- setNames(
  c(C1, C4, C3, C5),
  levels(tab_comp_plot$Modelo_corto)
)

p6 <- ggplot(tab_comp_compact,
             aes(x = Metrica, y = Valor, fill = Modelo_corto)) +
  geom_col(position = position_dodge(width = 0.75),
           alpha = 0.88, width = 0.70) +
  geom_text(aes(label = sprintf("%.2f", Valor)),
            position = position_dodge(width = 0.75),
            vjust = -0.4, size = 2.8, fontface = "bold") +
  scale_fill_manual(values = colores_panel, name = NULL) +
  scale_y_continuous(limits = c(0, 1.18),
                     labels = percent_format(accuracy = 1)) +
  labs(title    = "Comparación de Modelos",
       subtitle = "Evaluación en test original",
       x = NULL, y = "Valor") +
  tema_panel +
  theme(
    axis.text.x  = element_text(size = 9, face = "bold"),
    legend.position = "bottom"
  ) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE,
                              override.aes = list(size = 0.5)))

# ── Ensamblar panel final ──────────────────────────────────────────────────────
panel_final <- (p1 | p2 | p3) / (p4 | p5 | p6) +
  plot_annotation(
    title    = "Panel Consolidado — Paso 4: Modelado Predictivo I2C",
    subtitle = paste0(
      formatC(nrow(df_work), big.mark = ","),
      " registros RV  ·  265 clientes  ·  Logística + k-NN + Regresión Lineal  ·  ",
      format(Sys.Date(), "%d/%m/%Y")
    ),
    caption  = "Fuente: Base_Modelado_I2C_Lista_1.xlsx  |  Solo facturas RV  |  CRISP-DM Paso 4",
    theme = theme(
      plot.title      = element_text(face = "bold", size = 16, color = "#1B3A6B",
                                      margin = margin(b = 4)),
      plot.subtitle   = element_text(size = 11, color = "#555555",
                                      margin = margin(b = 8)),
      plot.caption    = element_text(size = 9, color = "#999999",
                                      margin = margin(t = 8)),
      plot.background = element_rect(fill = "white", color = NA),
      plot.margin     = margin(16, 16, 12, 16)
    )
  )

print(panel_final)

## `geom_smooth()` using formula = 'y ~ x'

ggsave("output_modelado/graficos/Panel_Consolidado_Final.png",
       panel_final,
       width  = 22,
       height = 14,
       dpi    = 180,
       bg     = "white")

## `geom_smooth()` using formula = 'y ~ x'

cat("\n✓ Panel consolidado exportado: output_modelado/graficos/Panel_Consolidado_Final.png\n")

## 
## ✓ Panel consolidado exportado: output_modelado/graficos/Panel_Consolidado_Final.png

cat("\n── Gráficos individuales exportados ──────────────────\n")

## 
## ── Gráficos individuales exportados ──────────────────

cat("  G1: Variable objetivo (Pago_Oportuno_Bin)\n")

##   G1: Variable objetivo (Pago_Oportuno_Bin)

cat("  G2: Distribución Bucket_Mora\n")

##   G2: Distribución Bucket_Mora

cat("  G3: Curvas ROC superpuestas\n")

##   G3: Curvas ROC superpuestas

cat("  G4: k-NN métricas por K\n")

##   G4: k-NN métricas por K

cat("  G5: Regresión lineal real vs predicho\n")

##   G5: Regresión lineal real vs predicho

cat("  G6: Comparación de 4 modelos\n")

##   G6: Comparación de 4 modelos

cat("  → Carpeta: output_modelado/graficos/\n\n")

##   → Carpeta: output_modelado/graficos/

cat("✅  Sección 11 completada\n")

## ✅  Sección 11 completada

MODELADO

2026-04-11