# ═══════════════════════════════════════════════════════════════════════════════
# PASO 4 — MODELADO PREDICTIVO | BASE I2C — CARTERA
# Proyecto : Analítica Predictiva — Metodología CRISP-DM
# Archivo : Base_Modelado_I2C_Lista_1.xlsx
# 141.669 registros · 33 variables · Solo facturas RV
#
# Distribución del target:
# · Pago_Oportuno_Bin = 1 (oportuno) : 138.919 (98,1%)
# · Pago_Oportuno_Bin = 0 (mora) : 2.742 ( 1,9%)
# → Desbalance severo: estrategia ROSE + SMOTE obligatoria
#
# Variables objetivo disponibles:
# · Pago_Oportuno_Bin → Clasificación binaria (0 = mora, 1 = oportuno)
# · Arrears_calc → Regresión continua (días de atraso)
# · Bucket_Mora_calc → Clasificación multiclase (tramo de mora)
#
# ESTRUCTURA DEL SCRIPT
# ─────────────────────
# SEC 0 ▸ Librerías
# SEC 1 ▸ Carga y validación de datos
# SEC 2 ▸ Análisis exploratorio (EDA confirmatorio)
# SEC 3 ▸ Preparación del dataset de modelado
# SEC 4 ▸ División train / test
# SEC 5 ▸ MODELO 1 — Regresión lineal (Arrears_calc)
# SEC 6 ▸ MODELO 2 — Regresión logística (Pago_Oportuno_Bin)
# SEC 7 ▸ Balanceo de clases (ROSE y SMOTE)
# SEC 8 ▸ MODELO 3 — Regresión logística con datos balanceados
# SEC 9 ▸ MODELO 4 — k-NN con datos balanceados
# SEC 10 ▸ Comparación y selección de modelos
# SEC 11 ▸ Gráficos consolidados
# SEC 12 ▸ Exportación de resultados
# ═══════════════════════════════════════════════════════════════════════════════
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 0 ▸ LIBRERÍAS
# ───────────────────────────────────────────────────────────────────────────────
paquetes <- c(
"readxl", # Carga del Excel
"tidyverse", # Manipulación de datos y ggplot2
"janitor", # Limpieza de nombres
"psych", # describe() — estadísticas descriptivas
"corrplot", # Visualización de correlaciones
"lmtest", # dwtest() — diagnóstico de regresión
"leaps", # regsubsets() — selección de variables
"ROSE", # ovun.sample() — balanceo ROSE
"smotefamily", # SMOTE() — síntesis de datos
"class", # knn() — k vecinos más cercanos
"caret", # confusionMatrix() — evaluación de modelos
"patchwork", # Paneles de gráficos
"scales" # Formateo de ejes
)
for (pkg in paquetes) {
if (!requireNamespace(pkg, quietly = TRUE))
install.packages(pkg, repos = "https://cran.r-project.org")
suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}
## Warning: package 'readxl' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'stringr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'janitor' was built under R version 4.5.3
## Warning: package 'psych' was built under R version 4.5.3
## Warning: package 'smotefamily' was built under R version 4.5.3
## Warning: package 'patchwork' was built under R version 4.5.3
## Warning: package 'scales' was built under R version 4.5.3
cat("✓ Librerías cargadas\n\n")
## ✓ Librerías cargadas
# ── Paleta y tema visual ──────────────────────────────────────────────────────
C1 <- "#1B3A6B" # azul — principal
C2 <- "#E84040" # rojo — mora / alerta
C3 <- "#F5A623" # naranja — advertencia
C4 <- "#27AE60" # verde — oportuno / positivo
C5 <- "#8E44AD" # morado — acento
PALETA_BUCKET <- c(
"Al dia" = C4, "1-30" = "#85C17E",
"31-60" = C3, "61-90" = "#E8A323",
"91-180" = "#D4691E", "181-360" = C2, ">360" = "#7B0000"
)
tema_base <- theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face = "bold", size = 13, color = C1),
plot.subtitle = element_text(size = 10, color = "gray40"),
plot.caption = element_text(size = 8, color = "gray55"),
axis.title = element_text(size = 10),
legend.position = "bottom",
panel.grid.minor = element_blank()
)
set.seed(42) # Semilla global para reproducibilidad
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 1 ▸ CARGA Y VALIDACIÓN DE DATOS
# ───────────────────────────────────────────────────────────────────────────────
cat("═══════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 1 — CARGA Y VALIDACIÓN\n")
## SECCIÓN 1 — CARGA Y VALIDACIÓN
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# ── Ajusta la ruta si el archivo está en otro directorio ─────────────────────
RUTA <- "C:/Users/jcabia01/Downloads/Base_Modelado_I2C_Lista_1.xlsx"
cat("Cargando archivo...\n")
## Cargando archivo...
df_raw <- read_excel(RUTA, sheet = "Modelado_Final", guess_max = 5000)
cat("✓ Cargado:", formatC(nrow(df_raw), big.mark = ","),
"filas ×", ncol(df_raw), "columnas\n\n")
## ✓ Cargado: 141,669 filas × 33 columnas
# Vista estructural
str(df_raw)
## tibble [141,669 × 33] (S3: tbl_df/tbl/data.frame)
## $ Anon_Customer_ID : chr [1:141669] "CUST_139" "CUST_139" "CUST_158" "CUST_158" ...
## $ Anon_Document_ID : chr [1:141669] "XXXXXX7262" "XXXXXX7465" "XXXXXX2943" "XXXXXX4030" ...
## $ Terms_of_Payment : chr [1:141669] "Z540" "Z540" "Z914" "Z914" ...
## $ Document_Type : chr [1:141669] "RV" "RV" "RV" "RV" ...
## $ Document_Date : POSIXct[1:141669], format: "2026-03-20" "2026-03-24" ...
## $ Payment_date : POSIXct[1:141669], format: "2026-05-24" "2026-05-25" ...
## $ Net_due_date : POSIXct[1:141669], format: "2026-05-24" "2026-05-25" ...
## $ Clearing_date : POSIXct[1:141669], format: NA NA ...
## $ Arrears_after_net_due_date: num [1:141669] -56 -57 -19 -26 -32 -32 -44 -20 -21 -21 ...
## $ Amount_in_local_currency : num [1:141669] 3.73e+07 1.83e+07 1.00e+08 1.29e+08 3.32e+07 ...
## $ Reason_code : chr [1:141669] "Sin_Codigo" "Sin_Codigo" "Sin_Codigo" "Sin_Codigo" ...
## $ Year/month : chr [1:141669] "2026/03" "2026/03" "2026/03" "2026/03" ...
## $ Estado_Cartera : chr [1:141669] "Abierta" "Abierta" "Abierta" "Abierta" ...
## $ Arrears_calc : num [1:141669] -44 -45 -7 -14 -20 -20 -32 -8 -9 -9 ...
## $ Pago_Oportuno_Bin : num [1:141669] 1 1 1 1 1 1 1 1 1 1 ...
## $ Bucket_Mora_calc : chr [1:141669] "Al dia" "Al dia" "Al dia" "Al dia" ...
## $ Prom_Mora_Cliente : num [1:141669] -0.43 -0.43 -15.17 -15.17 -15.17 ...
## $ Max_Mora_Cliente : num [1:141669] 0 0 -7 -7 -7 -7 -7 -8 -8 -8 ...
## $ Min_Mora_Cliente : num [1:141669] -45 -45 -32 -32 -32 -32 -32 -31 -31 -31 ...
## $ Desv_Mora_Cliente : num [1:141669] 4.35 4.35 1.95 1.95 1.95 1.95 1.95 1.86 1.86 1.86 ...
## $ Freq_Mora_Cliente : num [1:141669] 0 0 0 0 0 0 0 0 0 0 ...
## $ N_Facturas_Cliente : num [1:141669] 416 416 212 212 212 ...
## $ Mes_Documento : num [1:141669] 3 3 2 3 3 3 3 3 3 3 ...
## $ Trimestre_Documento : num [1:141669] 1 1 1 1 1 1 1 1 1 1 ...
## $ Anio_Documento : num [1:141669] 2026 2026 2026 2026 2026 ...
## $ Dias_Hasta_Vencimiento : num [1:141669] 65 62 49 48 47 47 47 47 48 48 ...
## $ Antiguedad_Documento : num [1:141669] 21 17 42 34 27 27 15 39 39 39 ...
## $ Monto_Absoluto : num [1:141669] 3.73e+07 1.83e+07 1.00e+08 1.29e+08 3.32e+07 ...
## $ Log_Monto : num [1:141669] 17.4 16.7 18.4 18.7 17.3 ...
## $ Terms_Grupo : chr [1:141669] "Z540" "Z540" "Z914" "Z914" ...
## $ Estado_Num : num [1:141669] 1 1 1 1 1 1 1 1 1 1 ...
## $ flag_fecha_inconsistente : num [1:141669] 0 0 0 0 0 0 0 0 0 0 ...
## $ flag_amount_negativo : num [1:141669] 0 0 0 0 0 0 0 0 0 0 ...
cat("\n")
print(summary(df_raw))
## Anon_Customer_ID Anon_Document_ID Terms_of_Payment Document_Type
## Length:141669 Length:141669 Length:141669 Length:141669
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Document_Date Payment_date
## Min. :2022-01-03 00:00:00 Min. :2022-01-18 00:00:00
## 1st Qu.:2022-12-19 00:00:00 1st Qu.:2023-01-27 00:00:00
## Median :2024-01-25 00:00:00 Median :2024-03-02 00:00:00
## Mean :2024-01-30 13:02:56 Mean :2024-03-08 16:52:00
## 3rd Qu.:2025-03-10 00:00:00 3rd Qu.:2025-04-17 00:00:00
## Max. :2026-03-26 00:00:00 Max. :2026-06-25 00:00:00
##
## Net_due_date Clearing_date
## Min. :2022-01-18 00:00:00 Min. :2022-01-04 00:00:00
## 1st Qu.:2023-02-03 00:00:00 1st Qu.:2023-01-20 00:00:00
## Median :2024-03-09 00:00:00 Median :2024-01-25 00:00:00
## Mean :2024-03-15 08:56:22 Mean :2024-02-06 15:38:05
## 3rd Qu.:2025-04-28 00:00:00 3rd Qu.:2025-02-17 00:00:00
## Max. :2026-06-25 00:00:00 Max. :2026-03-30 00:00:00
## NA's :7320
## Arrears_after_net_due_date Amount_in_local_currency Reason_code
## Min. :-382.000 Min. : -80962384 Length:141669
## 1st Qu.: -10.000 1st Qu.: 1514037 Class :character
## Median : 4.000 Median : 6131832 Mode :character
## Mean : 2.728 Mean : 23118025
## 3rd Qu.: 14.000 3rd Qu.: 20395610
## Max. : 577.000 Max. :1611343446
##
## Year/month Estado_Cartera Arrears_calc Pago_Oportuno_Bin
## Length:141669 Length:141669 Min. :-76.000 Min. :0.0000
## Class :character Class :character 1st Qu.:-15.000 1st Qu.:1.0000
## Mode :character Mode :character Median : 0.000 Median :1.0000
## Mean : -6.478 Mean :0.9806
## 3rd Qu.: 0.000 3rd Qu.:1.0000
## Max. :586.000 Max. :1.0000
## NA's :8 NA's :8
## Bucket_Mora_calc Prom_Mora_Cliente Max_Mora_Cliente Min_Mora_Cliente
## Length:141669 Min. :-23.000 Min. :-15.00 Min. :-76.00
## Class :character 1st Qu.:-14.970 1st Qu.: 0.00 1st Qu.:-32.00
## Mode :character Median : -0.630 Median : 26.00 Median :-28.00
## Mean : -6.479 Mean : 58.51 Mean :-25.31
## 3rd Qu.: 0.150 3rd Qu.:134.00 3rd Qu.:-21.00
## Max. : 19.650 Max. :586.00 Max. : 0.00
##
## Desv_Mora_Cliente Freq_Mora_Cliente N_Facturas_Cliente Mes_Documento
## Min. : 0.000 Min. :0.00000 Min. : 1 Min. : 1.000
## 1st Qu.: 2.110 1st Qu.:0.00000 1st Qu.: 941 1st Qu.: 3.000
## Median : 2.260 Median :0.01710 Median : 4044 Median : 6.000
## Mean : 4.007 Mean :0.01936 Mean :16159 Mean : 5.951
## 3rd Qu.: 3.480 3rd Qu.:0.03480 3rd Qu.:44940 3rd Qu.: 9.000
## Max. :103.360 Max. :0.44440 Max. :44940 Max. :12.000
## NA's :6
## Trimestre_Documento Anio_Documento Dias_Hasta_Vencimiento Antiguedad_Documento
## Min. :1.000 Min. :2022 Min. :-272.00 Min. : 15.0
## 1st Qu.:1.000 1st Qu.:2022 1st Qu.: 38.00 1st Qu.: 396.0
## Median :2.000 Median :2024 Median : 47.00 Median : 806.0
## Mean :2.321 Mean :2024 Mean : 44.83 Mean : 800.5
## 3rd Qu.:3.000 3rd Qu.:2025 3rd Qu.: 49.00 3rd Qu.:1208.0
## Max. :4.000 Max. :2026 Max. : 417.00 Max. :1558.0
##
## Monto_Absoluto Log_Monto Terms_Grupo Estado_Num
## Min. :5.560e+02 Min. : 6.323 Length:141669 Min. :0.00000
## 1st Qu.:1.515e+06 1st Qu.:14.231 Class :character 1st Qu.:0.00000
## Median :6.134e+06 Median :15.629 Mode :character Median :0.00000
## Mean :2.312e+07 Mean :15.517 Mean :0.05151
## 3rd Qu.:2.040e+07 3rd Qu.:16.831 3rd Qu.:0.00000
## Max. :1.611e+09 Max. :21.200 Max. :1.00000
##
## flag_fecha_inconsistente flag_amount_negativo
## Min. :0.000e+00 Min. :0.0000000
## 1st Qu.:0.000e+00 1st Qu.:0.0000000
## Median :0.000e+00 Median :0.0000000
## Mean :5.647e-05 Mean :0.0001623
## 3rd Qu.:0.000e+00 3rd Qu.:0.0000000
## Max. :1.000e+00 Max. :1.0000000
##
# ── Limpiar nombres de columnas ───────────────────────────────────────────────
df <- df_raw %>% clean_names()
cat("\nNombres de columnas limpios:\n")
##
## Nombres de columnas limpios:
print(names(df))
## [1] "anon_customer_id" "anon_document_id"
## [3] "terms_of_payment" "document_type"
## [5] "document_date" "payment_date"
## [7] "net_due_date" "clearing_date"
## [9] "arrears_after_net_due_date" "amount_in_local_currency"
## [11] "reason_code" "year_month"
## [13] "estado_cartera" "arrears_calc"
## [15] "pago_oportuno_bin" "bucket_mora_calc"
## [17] "prom_mora_cliente" "max_mora_cliente"
## [19] "min_mora_cliente" "desv_mora_cliente"
## [21] "freq_mora_cliente" "n_facturas_cliente"
## [23] "mes_documento" "trimestre_documento"
## [25] "anio_documento" "dias_hasta_vencimiento"
## [27] "antiguedad_documento" "monto_absoluto"
## [29] "log_monto" "terms_grupo"
## [31] "estado_num" "flag_fecha_inconsistente"
## [33] "flag_amount_negativo"
# ── Conversión de tipos ───────────────────────────────────────────────────────
df <- df %>%
mutate(
# Fechas
document_date = as.Date(document_date),
payment_date = as.Date(payment_date),
net_due_date = as.Date(net_due_date),
clearing_date = as.Date(clearing_date),
# Numéricas continuas (ya deberían ser numéricas, forzar por seguridad)
arrears_calc = as.numeric(arrears_calc),
amount_in_local_currency = as.numeric(amount_in_local_currency),
log_monto = as.numeric(log_monto),
monto_absoluto = as.numeric(monto_absoluto),
prom_mora_cliente = as.numeric(prom_mora_cliente),
max_mora_cliente = as.numeric(max_mora_cliente),
min_mora_cliente = as.numeric(min_mora_cliente),
desv_mora_cliente = as.numeric(desv_mora_cliente),
freq_mora_cliente = as.numeric(freq_mora_cliente),
dias_hasta_vencimiento = as.numeric(dias_hasta_vencimiento),
antiguedad_documento = as.numeric(antiguedad_documento),
# Enteras
n_facturas_cliente = as.integer(n_facturas_cliente),
mes_documento = as.integer(mes_documento),
trimestre_documento = as.integer(trimestre_documento),
anio_documento = as.integer(anio_documento),
estado_num = as.integer(estado_num),
flag_fecha_inconsistente = as.integer(flag_fecha_inconsistente),
flag_amount_negativo = as.integer(flag_amount_negativo),
# Factores (categóricas)
terms_of_payment = as.factor(terms_of_payment),
terms_grupo = as.factor(terms_grupo),
estado_cartera = as.factor(estado_cartera),
bucket_mora_calc = factor(bucket_mora_calc,
levels = c("Al dia","1-30","31-60","61-90","91-180","181-360",">360"),
ordered = TRUE),
# Variable objetivo binaria — FACTOR con niveles explícitos
pago_oportuno_bin = factor(pago_oportuno_bin,
levels = c(0, 1),
labels = c("Mora","Oportuno"))
)
cat("\n✓ Tipos de datos asignados\n")
##
## ✓ Tipos de datos asignados
cat(" Clientes únicos :", n_distinct(df$anon_customer_id), "\n")
## Clientes únicos : 265
cat(" Documentos únicos:", n_distinct(df$anon_document_id), "\n")
## Documentos únicos: 10000
cat(" Flags (fecha) :", sum(df$flag_fecha_inconsistente, na.rm = TRUE), "\n")
## Flags (fecha) : 8
cat(" Flags (monto neg):", sum(df$flag_amount_negativo, na.rm = TRUE), "\n\n")
## Flags (monto neg): 23
# Distribución del target
cat("── Distribución Pago_Oportuno_Bin ──\n")
## ── Distribución Pago_Oportuno_Bin ──
print(table(df$pago_oportuno_bin))
##
## Mora Oportuno
## 2742 138919
print(prop.table(table(df$pago_oportuno_bin)))
##
## Mora Oportuno
## 0.01935607 0.98064393
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 2 ▸ ANÁLISIS EXPLORATORIO (EDA CONFIRMATORIO)
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 2 — EDA CONFIRMATORIO\n")
## SECCIÓN 2 — EDA CONFIRMATORIO
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# ── 2.1 Estadísticas descriptivas completas (psych) ──────────────────────────
cat("── Estadísticas descriptivas (psych::describe) ──\n")
## ── Estadísticas descriptivas (psych::describe) ──
vars_desc <- df %>%
select(arrears_calc, amount_in_local_currency, log_monto,
prom_mora_cliente, max_mora_cliente, freq_mora_cliente,
dias_hasta_vencimiento, antiguedad_documento) %>%
as.data.frame()
print(describe(vars_desc))
## vars n mean sd median
## arrears_calc 1 141661 -6.48 12.72 0.00
## amount_in_local_currency 2 141669 23118024.97 59330124.74 6131832.00
## log_monto 3 141669 15.52 1.81 15.63
## prom_mora_cliente 4 141669 -6.48 7.40 -0.63
## max_mora_cliente 5 141669 58.51 84.03 26.00
## freq_mora_cliente 6 141669 0.02 0.02 0.02
## dias_hasta_vencimiento 7 141669 44.83 8.19 47.00
## antiguedad_documento 8 141669 800.46 461.30 806.00
## trimmed mad min max
## arrears_calc -6.50 0.00 -76.00 5.860000e+02
## amount_in_local_currency 11042210.09 8167690.84 -80962383.99 1.611343e+09
## log_monto 15.54 1.91 6.32 2.120000e+01
## prom_mora_cliente -6.42 1.29 -23.00 1.965000e+01
## max_mora_cliente 50.39 47.44 -15.00 5.860000e+02
## freq_mora_cliente 0.02 0.03 0.00 4.400000e-01
## dias_hasta_vencimiento 44.60 7.41 -272.00 4.170000e+02
## antiguedad_documento 804.92 601.94 15.00 1.558000e+03
## range skew kurtosis se
## arrears_calc 6.620000e+02 26.67 1177.26 0.03
## amount_in_local_currency 1.692306e+09 7.46 79.04 157629.63
## log_monto 1.488000e+01 -0.10 -0.48 0.00
## prom_mora_cliente 4.265000e+01 0.02 -1.23 0.02
## max_mora_cliente 6.010000e+02 2.94 14.74 0.22
## freq_mora_cliente 4.400000e-01 2.77 37.59 0.00
## dias_hasta_vencimiento 6.890000e+02 1.56 94.56 0.02
## antiguedad_documento 1.543000e+03 -0.06 -1.22 1.23
# ── 2.2 Frecuencias de categóricas ───────────────────────────────────────────
cat("\n── Frecuencias — Terms_of_Payment ──\n")
##
## ── Frecuencias — Terms_of_Payment ──
print(table(df$terms_of_payment))
##
## B030 B045 P030 Z000 Z014 Z040 Z045 Z090 Z502 Z505 Z516 Z521 Z522
## 9 63 91 8 3 269 3 73 38 191 15 9685 44944
## Z525 Z526 Z540 Z668 Z672 Z691 Z827 Z913 Z914
## 1131 14591 3887 36 644 195 48 8263 57482
print(prop.table(table(df$terms_of_payment)))
##
## B030 B045 P030 Z000 Z014 Z040
## 6.352837e-05 4.446986e-04 6.423424e-04 5.646966e-05 2.117612e-05 1.898792e-03
## Z045 Z090 Z502 Z505 Z516 Z521
## 2.117612e-05 5.152856e-04 2.682309e-04 1.348213e-03 1.058806e-04 6.836358e-02
## Z522 Z525 Z526 Z540 Z668 Z672
## 3.172465e-01 7.983398e-03 1.029936e-01 2.743720e-02 2.541135e-04 4.545807e-03
## Z691 Z827 Z913 Z914
## 1.376448e-03 3.388179e-04 5.832610e-02 4.057486e-01
cat("\n── Frecuencias — Estado_Cartera ──\n")
##
## ── Frecuencias — Estado_Cartera ──
print(table(df$estado_cartera))
##
## Abierta Cerrada
## 7298 134371
cat("\n── Frecuencias — Bucket_Mora ──\n")
##
## ── Frecuencias — Bucket_Mora ──
print(table(df$bucket_mora_calc))
##
## Al dia 1-30 31-60 61-90 91-180 181-360 >360
## 138919 2609 77 3 4 6 43
print(prop.table(table(df$bucket_mora_calc)))
##
## Al dia 1-30 31-60 61-90 91-180 181-360
## 9.806439e-01 1.841721e-02 5.435512e-04 2.117732e-05 2.823642e-05 4.235464e-05
## >360
## 3.035416e-04
cat("\n── Frecuencias — Terms_Grupo ──\n")
##
## ── Frecuencias — Terms_Grupo ──
print(table(df$terms_grupo))
##
## Otros Z521 Z522 Z526 Z540 Z913 Z914
## 2817 9685 44944 14591 3887 8263 57482
# ── 2.3 Matriz de correlación ────────────────────────────────────────────────
cat("\n── Matriz de correlación ──\n")
##
## ── Matriz de correlación ──
vars_cor <- df %>%
select(arrears_calc, log_monto, prom_mora_cliente, max_mora_cliente,
freq_mora_cliente, desv_mora_cliente, dias_hasta_vencimiento,
antiguedad_documento, mes_documento, estado_num) %>%
drop_na() %>%
as.data.frame()
mat_cor <- cor(vars_cor)
print(round(mat_cor, 3))
## arrears_calc log_monto prom_mora_cliente
## arrears_calc 1.000 -0.080 0.582
## log_monto -0.080 1.000 -0.142
## prom_mora_cliente 0.582 -0.142 1.000
## max_mora_cliente 0.391 -0.177 0.672
## freq_mora_cliente 0.289 -0.187 0.496
## desv_mora_cliente 0.174 0.139 0.300
## dias_hasta_vencimiento -0.219 0.072 -0.369
## antiguedad_documento 0.015 -0.090 0.046
## mes_documento -0.013 0.015 -0.022
## estado_num 0.071 0.033 0.009
## max_mora_cliente freq_mora_cliente desv_mora_cliente
## arrears_calc 0.391 0.289 0.174
## log_monto -0.177 -0.187 0.139
## prom_mora_cliente 0.672 0.496 0.300
## max_mora_cliente 1.000 0.508 0.672
## freq_mora_cliente 0.508 1.000 0.132
## desv_mora_cliente 0.672 0.132 1.000
## dias_hasta_vencimiento -0.318 -0.247 -0.049
## antiguedad_documento 0.055 -0.063 0.029
## mes_documento -0.013 -0.022 0.008
## estado_num -0.001 0.073 0.008
## dias_hasta_vencimiento antiguedad_documento
## arrears_calc -0.219 0.015
## log_monto 0.072 -0.090
## prom_mora_cliente -0.369 0.046
## max_mora_cliente -0.318 0.055
## freq_mora_cliente -0.247 -0.063
## desv_mora_cliente -0.049 0.029
## dias_hasta_vencimiento 1.000 -0.163
## antiguedad_documento -0.163 1.000
## mes_documento 0.055 -0.066
## estado_num 0.049 -0.382
## mes_documento estado_num
## arrears_calc -0.013 0.071
## log_monto 0.015 0.033
## prom_mora_cliente -0.022 0.009
## max_mora_cliente -0.013 -0.001
## freq_mora_cliente -0.022 0.073
## desv_mora_cliente 0.008 0.008
## dias_hasta_vencimiento 0.055 0.049
## antiguedad_documento -0.066 -0.382
## mes_documento 1.000 -0.233
## estado_num -0.233 1.000
corrplot(mat_cor,
method = "color",
type = "lower",
addCoef.col = "black",
number.cex = 0.75,
tl.cex = 0.85,
tl.col = C1,
col = colorRampPalette(c(C2, "white", C1))(200),
title = "Correlación — Variables de Modelado",
mar = c(0, 0, 2, 0)
)

# ── 2.4 Gráficos exploratorios ───────────────────────────────────────────────
# Distribución de Bucket_Mora
p_bkt <- df %>%
filter(!is.na(bucket_mora_calc)) %>%
count(bucket_mora_calc) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
ggplot(aes(x = bucket_mora_calc, y = n, fill = bucket_mora_calc)) +
geom_col(alpha = 0.9, show.legend = FALSE) +
geom_text(aes(label = paste0(pct, "%")), vjust = -0.4, size = 3.5, fontface = "bold") +
scale_fill_manual(values = PALETA_BUCKET) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.12))) +
labs(title = "Distribución por Bucket_Mora", x = NULL, y = "Registros") +
tema_base
# Target binario
tab_target <- df %>%
count(pago_oportuno_bin) %>%
mutate(pct = round(n / sum(n) * 100, 1))
p_target <- ggplot(tab_target, aes(x = pago_oportuno_bin, y = n,
fill = pago_oportuno_bin)) +
geom_col(alpha = 0.9, show.legend = FALSE) +
geom_text(aes(label = paste0(comma(n), "\n(", pct, "%)")),
vjust = -0.3, size = 4, fontface = "bold") +
scale_fill_manual(values = c("Mora" = C2, "Oportuno" = C4)) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
labs(title = "Variable Objetivo: Pago_Oportuno_Bin",
subtitle = "Desbalance severo: 98,1% oportuno vs 1,9% mora",
x = NULL, y = "Registros") +
tema_base
# Arrears por condición de pago
p_arr_terms <- df %>%
filter(!is.na(arrears_calc), !is.na(terms_grupo)) %>%
ggplot(aes(x = reorder(terms_grupo, arrears_calc, FUN = median),
y = arrears_calc, fill = terms_grupo)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.1, show.legend = FALSE) +
geom_hline(yintercept = 0, linetype = "dashed", color = C2, linewidth = 0.8) +
scale_y_continuous(labels = comma) +
labs(title = "Arrears por Condición de Pago",
subtitle = "Línea roja = 0 días (umbral mora)",
x = "Terms_Grupo", y = "Días de atraso") +
tema_base +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Dispersión: Log_Monto vs Arrears
set.seed(42)
df_sample <- df %>%
filter(!is.na(arrears_calc), !is.na(log_monto)) %>%
slice_sample(n = 5000)
p_scatter <- ggplot(df_sample,
aes(x = log_monto, y = arrears_calc,
color = pago_oportuno_bin)) +
geom_point(alpha = 0.25, size = 0.9) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
scale_color_manual(values = c("Mora" = C2, "Oportuno" = C4),
name = "Estado pago") +
labs(title = "Scatter: Log(Monto) vs Arrears",
subtitle = "Muestra aleatoria de 5.000 registros",
x = "Log(Monto)", y = "Días de atraso (Arrears_calc)") +
tema_base
panel_eda <- (p_target | p_bkt) / (p_arr_terms | p_scatter) +
plot_annotation(
title = "EDA Confirmatorio — Paso 4 Modelado I2C",
theme = theme(plot.title = element_text(face = "bold", size = 14, color = C1))
)
print(panel_eda)

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 3 ▸ PREPARACIÓN DEL DATASET DE MODELADO
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 3 — PREPARACIÓN DEL DATASET\n")
## SECCIÓN 3 — PREPARACIÓN DEL DATASET
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# ── 3.1 Filtrar anomalías y construir dataset limpio ─────────────────────────
df_work <- df %>%
filter(
flag_fecha_inconsistente == 0, # excluir 8 registros con fechas incoherentes
flag_amount_negativo == 0, # excluir 23 registros con monto negativo
!is.na(arrears_calc),
!is.na(pago_oportuno_bin),
!is.na(log_monto),
!is.na(prom_mora_cliente),
!is.na(dias_hasta_vencimiento)
) %>%
as.data.frame()
cat("Registros para modelado:", formatC(nrow(df_work), big.mark = ","), "\n")
## Registros para modelado: 141,638
cat(" (excluidos: 8 fechas inconsistentes + 23 montos negativos + NAs)\n\n")
## (excluidos: 8 fechas inconsistentes + 23 montos negativos + NAs)
# ── 3.2 Definir variables predictoras por modelo ─────────────────────────────
# Features para REGRESIÓN LINEAL (Arrears_calc ~ ...)
VARS_LM <- c(
"log_monto", "prom_mora_cliente", "max_mora_cliente",
"freq_mora_cliente", "desv_mora_cliente",
"dias_hasta_vencimiento", "mes_documento",
"trimestre_documento", "anio_documento",
"antiguedad_documento", "estado_num", "terms_grupo"
)
# Features para CLASIFICACIÓN (Pago_Oportuno_Bin ~ ...)
VARS_CLAS <- c(
"log_monto", "prom_mora_cliente", "max_mora_cliente",
"freq_mora_cliente", "desv_mora_cliente",
"dias_hasta_vencimiento", "mes_documento",
"trimestre_documento", "estado_num", "terms_grupo"
)
cat("Variables predictoras — Regresión lineal :", length(VARS_LM), "\n")
## Variables predictoras — Regresión lineal : 12
cat("Variables predictoras — Clasificación :", length(VARS_CLAS), "\n\n")
## Variables predictoras — Clasificación : 10
# ── 3.3 Transformaciones adicionales para mejorar modelos ────────────────────
df_work <- df_work %>%
mutate(
# Cuadrado de log_monto (captura no-linealidad en monto)
log_monto_sq = log_monto^2,
# Inverso de días hasta vencimiento (plazos cortos tienen mayor impacto)
inv_dias_vcto = if_else(dias_hasta_vencimiento > 0,
1 / dias_hasta_vencimiento, NA_real_),
# Interacción: frecuencia de mora × promedio mora (score compuesto)
score_riesgo = freq_mora_cliente * abs(prom_mora_cliente),
# Monto en millones (más interpretable)
monto_millones = monto_absoluto / 1e6
)
cat("✓ Transformaciones adicionales creadas:\n")
## ✓ Transformaciones adicionales creadas:
cat(" · log_monto_sq = log_monto² (no-linealidad)\n")
## · log_monto_sq = log_monto² (no-linealidad)
cat(" · inv_dias_vcto = 1/días_vencimiento (urgencia del plazo)\n")
## · inv_dias_vcto = 1/días_vencimiento (urgencia del plazo)
cat(" · score_riesgo = freq_mora × |prom_mora| (score compuesto)\n")
## · score_riesgo = freq_mora × |prom_mora| (score compuesto)
cat(" · monto_millones = Amount / 1.000.000\n\n")
## · monto_millones = Amount / 1.000.000
# Convertir terms_grupo a dummies para regresión
df_work$terms_grupo <- as.factor(df_work$terms_grupo)
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 4 ▸ DIVISIÓN TRAIN / TEST (70% / 30%)
# ───────────────────────────────────────────────────────────────────────────────
cat("═══════════════════════════════════════════════════════\n")
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 4 — DIVISIÓN TRAIN / TEST\n")
## SECCIÓN 4 — DIVISIÓN TRAIN / TEST
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
set.seed(42)
n_total <- nrow(df_work)
idx_train <- sample.int(n_total, size = floor(0.70 * n_total))
df_train <- df_work[ idx_train, ]
df_test <- df_work[-idx_train, ]
cat(" Total disponible :", formatC(n_total, big.mark = ","), "\n")
## Total disponible : 141,638
cat(" Train (70%) :", formatC(nrow(df_train), big.mark = ","), "\n")
## Train (70%) : 99,146
cat(" Test (30%) :", formatC(nrow(df_test), big.mark = ","), "\n\n")
## Test (30%) : 42,492
cat("Proporción Pago_Oportuno_Bin en TRAIN:\n")
## Proporción Pago_Oportuno_Bin en TRAIN:
print(prop.table(table(df_train$pago_oportuno_bin)))
##
## Mora Oportuno
## 0.01966796 0.98033204
cat("Proporción Pago_Oportuno_Bin en TEST:\n")
## Proporción Pago_Oportuno_Bin en TEST:
print(prop.table(table(df_test$pago_oportuno_bin)))
##
## Mora Oportuno
## 0.0186388 0.9813612
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 5 ▸ MODELO 1 — REGRESIÓN LINEAL MÚLTIPLE (Arrears_calc)
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 5 — MODELO 1: REGRESIÓN LINEAL (Arrears)\n")
## SECCIÓN 5 — MODELO 1: REGRESIÓN LINEAL (Arrears)
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# ── 5.1 Fórmula del modelo completo ──────────────────────────────────────────
formula_lm <- as.formula(
paste("arrears_calc ~",
paste(c(VARS_LM, "log_monto_sq", "inv_dias_vcto", "score_riesgo"),
collapse = " + "))
)
cat("Fórmula modelo lineal:\n"); print(formula_lm)
## Fórmula modelo lineal:
## arrears_calc ~ log_monto + prom_mora_cliente + max_mora_cliente +
## freq_mora_cliente + desv_mora_cliente + dias_hasta_vencimiento +
## mes_documento + trimestre_documento + anio_documento + antiguedad_documento +
## estado_num + terms_grupo + log_monto_sq + inv_dias_vcto +
## score_riesgo
# ── 5.2 Ajuste del modelo ────────────────────────────────────────────────────
cat("\nAjustando modelo de regresión lineal...\n")
##
## Ajustando modelo de regresión lineal...
modelo_lm <- lm(formula_lm, data = df_train)
cat("\n── Resumen del modelo completo ──\n")
##
## ── Resumen del modelo completo ──
print(summary(modelo_lm))
##
## Call:
## lm(formula = formula_lm, data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.19 -0.48 0.17 0.80 564.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.909e+03 3.047e+03 0.627 0.53097
## log_monto 2.320e-01 2.693e-01 0.862 0.38893
## prom_mora_cliente 4.640e-01 3.623e-02 12.808 < 2e-16 ***
## max_mora_cliente -2.835e-03 1.759e-03 -1.612 0.10696
## freq_mora_cliente -8.485e+00 4.939e+00 -1.718 0.08578 .
## desv_mora_cliente 1.186e-01 1.389e-02 8.539 < 2e-16 ***
## dias_hasta_vencimiento -4.424e-03 7.717e-03 -0.573 0.56645
## mes_documento -6.838e-01 1.287e-01 -5.311 1.09e-07 ***
## trimestre_documento 2.179e+00 1.326e-01 16.435 < 2e-16 ***
## anio_documento -9.443e-01 1.504e+00 -0.628 0.53008
## antiguedad_documento -1.953e-03 4.110e-03 -0.475 0.63466
## estado_num 5.613e+00 1.787e-01 31.409 < 2e-16 ***
## terms_grupoZ521 7.821e-01 2.950e-01 2.651 0.00803 **
## terms_grupoZ522 8.998e-01 3.635e-01 2.476 0.01330 *
## terms_grupoZ526 -4.872e-01 2.755e-01 -1.769 0.07694 .
## terms_grupoZ540 -9.733e-01 3.609e-01 -2.697 0.00700 **
## terms_grupoZ913 -4.902e+00 4.357e-01 -11.251 < 2e-16 ***
## terms_grupoZ914 -7.835e+00 5.612e-01 -13.960 < 2e-16 ***
## log_monto_sq -8.042e-03 8.698e-03 -0.925 0.35519
## inv_dias_vcto 9.906e-01 3.616e+00 0.274 0.78411
## score_riesgo 7.836e-01 4.774e-01 1.641 0.10072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.84 on 99121 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.3283, Adjusted R-squared: 0.3281
## F-statistic: 2422 on 20 and 99121 DF, p-value: < 2.2e-16
# Intervalos de confianza (95%)
cat("\n── Intervalos de confianza (95%) ──\n")
##
## ── Intervalos de confianza (95%) ──
print(confint(modelo_lm))
## 2.5 % 97.5 %
## (Intercept) -4.063418e+03 7.881810e+03
## log_monto -2.958383e-01 7.599304e-01
## prom_mora_cliente 3.930169e-01 5.350400e-01
## max_mora_cliente -6.282501e-03 6.119953e-04
## freq_mora_cliente -1.816521e+01 1.194566e+00
## desv_mora_cliente 9.141218e-02 1.458773e-01
## dias_hasta_vencimiento -1.954974e-02 1.070141e-02
## mes_documento -9.360904e-01 -4.314455e-01
## trimestre_documento 1.919049e+00 2.438764e+00
## anio_documento -3.891820e+00 2.003315e+00
## antiguedad_documento -1.000811e-02 6.102293e-03
## estado_num 5.262637e+00 5.963141e+00
## terms_grupoZ521 2.038351e-01 1.360280e+00
## terms_grupoZ522 1.874727e-01 1.612211e+00
## terms_grupoZ526 -1.027146e+00 5.268713e-02
## terms_grupoZ540 -1.680627e+00 -2.659291e-01
## terms_grupoZ913 -5.756337e+00 -4.048360e+00
## terms_grupoZ914 -8.934659e+00 -6.734681e+00
## log_monto_sq -2.508918e-02 9.005883e-03
## inv_dias_vcto -6.095944e+00 8.077086e+00
## score_riesgo -1.521058e-01 1.719371e+00
# ── 5.3 Diagnóstico gráfico ──────────────────────────────────────────────────
cat("\n── Diagnóstico gráfico del modelo lineal (4 paneles) ──\n")
##
## ── Diagnóstico gráfico del modelo lineal (4 paneles) ──
par(mfrow = c(2, 2))
plot(modelo_lm, col = C1, pch = 19, cex = 0.4,
main = "Diagnóstico Regresión Lineal")

par(mfrow = c(1, 1))
# Test de Durbin-Watson (autocorrelación de residuos)
cat("\n── Test Durbin-Watson ──\n")
##
## ── Test Durbin-Watson ──
dw_result <- dwtest(modelo_lm)
print(dw_result)
##
## Durbin-Watson test
##
## data: modelo_lm
## DW = 1.9976, p-value = 0.3512
## alternative hypothesis: true autocorrelation is greater than 0
cat(sprintf(" DW = %.4f | p-valor = %.4f\n",
dw_result$statistic, dw_result$p.value))
## DW = 1.9976 | p-valor = 0.3512
cat(" Interpretación: DW ≈ 2 → sin autocorrelación (ideal)\n")
## Interpretación: DW ≈ 2 → sin autocorrelación (ideal)
# ── 5.4 Selección stepwise ───────────────────────────────────────────────────
cat("\n── Selección stepwise (AIC) ──\n")
##
## ── Selección stepwise (AIC) ──
modelo_lm_step <- step(modelo_lm, direction = "both", trace = 0)
cat("\n► Modelo stepwise final:\n")
##
## ► Modelo stepwise final:
print(summary(modelo_lm_step))
##
## Call:
## lm(formula = arrears_calc ~ prom_mora_cliente + max_mora_cliente +
## freq_mora_cliente + desv_mora_cliente + mes_documento + trimestre_documento +
## anio_documento + estado_num + terms_grupo + score_riesgo,
## data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.62 -0.47 0.17 0.80 564.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 476.000894 61.257554 7.770 7.89e-15 ***
## prom_mora_cliente 0.464679 0.035932 12.932 < 2e-16 ***
## max_mora_cliente -0.002889 0.001752 -1.649 0.099203 .
## freq_mora_cliente -8.805533 4.924013 -1.788 0.073733 .
## desv_mora_cliente 0.118590 0.013805 8.590 < 2e-16 ***
## mes_documento -0.627573 0.043115 -14.556 < 2e-16 ***
## trimestre_documento 2.186103 0.132109 16.548 < 2e-16 ***
## anio_documento -0.236238 0.030278 -7.802 6.14e-15 ***
## estado_num 5.613796 0.178644 31.424 < 2e-16 ***
## terms_grupoZ521 0.781897 0.283029 2.763 0.005735 **
## terms_grupoZ522 0.934178 0.356563 2.620 0.008796 **
## terms_grupoZ526 -0.523661 0.267871 -1.955 0.050597 .
## terms_grupoZ540 -1.090826 0.328353 -3.322 0.000894 ***
## terms_grupoZ913 -4.869068 0.418688 -11.629 < 2e-16 ***
## terms_grupoZ914 -7.863899 0.554930 -14.171 < 2e-16 ***
## score_riesgo 0.816300 0.476233 1.714 0.086517 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.84 on 99126 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.3283, Adjusted R-squared: 0.3282
## F-statistic: 3229 on 15 and 99126 DF, p-value: < 2.2e-16
cat(sprintf("\n AIC modelo completo : %.2f\n", AIC(modelo_lm)))
##
## AIC modelo completo : 753897.09
cat(sprintf(" AIC modelo stepwise : %.2f\n", AIC(modelo_lm_step)))
## AIC modelo stepwise : 753889.48
# ── 5.5 Mejor subconjunto (regsubsets) ───────────────────────────────────────
cat("\n── Mejor subconjunto (leaps::regsubsets) ──\n")
##
## ── Mejor subconjunto (leaps::regsubsets) ──
# Construir X solo con variables numéricas para regsubsets
vars_num_lm <- c("log_monto","prom_mora_cliente","max_mora_cliente",
"freq_mora_cliente","desv_mora_cliente",
"dias_hasta_vencimiento","mes_documento",
"anio_documento","estado_num",
"log_monto_sq","score_riesgo")
X_lm <- df_train[, vars_num_lm] %>% drop_na() %>% as.matrix()
y_lm <- df_train$arrears_calc[complete.cases(df_train[, vars_num_lm])]
best_sub <- regsubsets(x = X_lm, y = y_lm,
nbest = 1, nvmax = 8, method = "exhaustive")
best_sum <- summary(best_sub)
cat("\nR² ajustado por número de variables:\n")
##
## R² ajustado por número de variables:
print(round(best_sum$adjr2, 4))
## [1] 0.3185 0.3237 0.3241 0.3243 0.3245 0.3245 0.3246 0.3246
cat("\nBIC por número de variables:\n")
##
## BIC por número de variables:
print(round(best_sum$bic, 3))
## [1] -37994.43 -38737.89 -38789.08 -38816.43 -38824.52 -38825.43 -38821.00
## [8] -38809.59
cat("\nCp por número de variables:\n")
##
## Cp por número de variables:
print(round(best_sum$cp, 3))
## [1] 892.263 135.430 74.678 37.798 20.207 9.790 4.717 6.622
n_opt <- which.min(best_sum$bic)
cat(sprintf("\n✓ Variables óptimas según BIC mínimo: %d\n", n_opt))
##
## ✓ Variables óptimas según BIC mínimo: 6
cat(" Variables seleccionadas:\n")
## Variables seleccionadas:
print(best_sum$which[n_opt, ])
## (Intercept) log_monto prom_mora_cliente
## TRUE FALSE TRUE
## max_mora_cliente freq_mora_cliente desv_mora_cliente
## FALSE FALSE TRUE
## dias_hasta_vencimiento mes_documento anio_documento
## TRUE TRUE TRUE
## estado_num log_monto_sq score_riesgo
## TRUE FALSE FALSE
# ── 5.6 Predicción y métricas ────────────────────────────────────────────────
# predict(newdata = df_train) puede devolver menos filas que df_train si
# el modelo eliminó NAs internamente. Se usa fitted() para train y
# predict() solo para test (donde sí podemos controlar los NAs).
pred_lm_test <- predict(modelo_lm_step, newdata = df_test)
# Para train: usar los valores ajustados directamente del modelo
# y los valores reales de las filas efectivamente usadas en el ajuste
idx_used_train <- as.integer(rownames(model.frame(modelo_lm_step)))
y_train_used <- df_train$arrears_calc[idx_used_train]
pred_lm_train <- fitted(modelo_lm_step)
RMSE <- function(y, yhat) sqrt(mean((y - yhat)^2, na.rm = TRUE))
MAE <- function(y, yhat) mean(abs(y - yhat), na.rm = TRUE)
R2 <- function(y, yhat) cor(y, yhat, use = "complete.obs")^2
rmse_tr <- RMSE(y_train_used, pred_lm_train)
rmse_te <- RMSE(df_test$arrears_calc, pred_lm_test)
mae_tr <- MAE(y_train_used, pred_lm_train)
mae_te <- MAE(df_test$arrears_calc, pred_lm_test)
r2_te <- R2(df_test$arrears_calc, pred_lm_test)
cat("\n── Métricas Regresión Lineal ──────────────────────\n")
##
## ── Métricas Regresión Lineal ──────────────────────
cat(sprintf(" RMSE Train : %.4f días\n", rmse_tr))
## RMSE Train : 15.3612 días
cat(sprintf(" RMSE Test : %.4f días ← indicador principal\n", rmse_te))
## RMSE Test : 8.8456 días ← indicador principal
cat(sprintf(" MAE Train : %.4f días\n", mae_tr))
## MAE Train : 8.2739 días
cat(sprintf(" MAE Test : %.4f días\n", mae_te))
## MAE Test : 1.5695 días
cat(sprintf(" R² Test : %.6f\n", r2_te))
## R² Test : 0.406928
# Gráfico real vs predicho
p_lm <- data.frame(real = df_test$arrears_calc, pred = pred_lm_test) %>%
ggplot(aes(x = real, y = pred)) +
geom_point(alpha = 0.12, size = 0.7, color = C1) +
geom_abline(slope = 1, intercept = 0, color = C2,
linetype = "dashed", linewidth = 1) +
geom_smooth(method = "lm", color = C4, linewidth = 0.8, se = FALSE) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(title = "Regresión Lineal: Arrears Real vs Predicho",
subtitle = sprintf("RMSE = %.4f días | R² = %.6f", rmse_te, r2_te),
caption = "Línea roja = predicción perfecta | Línea verde = tendencia real",
x = "Arrears real (días)", y = "Arrears predicho (días)") +
tema_base
print(p_lm)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Gráfico de residuos
# NOTA: fitted() y residuals() se extraen directamente del modelo para
# garantizar que ambos vectores tienen exactamente las mismas filas.
# lm() elimina internamente filas con NAs, por lo que predict(newdata=df_train)
# puede devolver más filas que residuals() — ese desajuste causaba el error.
p_resid <- data.frame(
fitted = fitted(modelo_lm_step), # valores ajustados del modelo
resid = residuals(modelo_lm_step) # residuos del mismo modelo
) %>%
ggplot(aes(x = fitted, y = resid)) +
geom_point(alpha = 0.1, size = 0.6, color = C1) +
geom_hline(yintercept = 0, color = C2, linetype = "dashed") +
geom_smooth(method = "loess", color = C3, linewidth = 0.8, se = FALSE) +
labs(title = "Residuos vs Valores Ajustados",
subtitle = "Sin patrón sistemático = buen ajuste",
x = "Valores ajustados", y = "Residuos") +
tema_base
print(p_resid)
## `geom_smooth()` using formula = 'y ~ x'

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 6 ▸ MODELO 2 — REGRESIÓN LOGÍSTICA (Pago_Oportuno_Bin — datos originales)
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 6 — MODELO 2: REGRESIÓN LOGÍSTICA (original)\n")
## SECCIÓN 6 — MODELO 2: REGRESIÓN LOGÍSTICA (original)
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
cat("NOTA: Con 98,1% vs 1,9% de desbalance, este modelo sirve\n")
## NOTA: Con 98,1% vs 1,9% de desbalance, este modelo sirve
cat("como línea base. Los modelos con datos balanceados (SEC 8-9)\n")
## como línea base. Los modelos con datos balanceados (SEC 8-9)
cat("son los definitivos para comparación.\n\n")
## son los definitivos para comparación.
formula_logit <- as.formula(
paste("pago_oportuno_bin ~",
paste(c(VARS_CLAS, "score_riesgo"), collapse = " + "))
)
cat("Ajustando regresión logística (datos originales)...\n")
## Ajustando regresión logística (datos originales)...
modelo_logit <- glm(formula_logit,
data = df_train,
family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat("\n── Resumen modelo logístico ──\n")
##
## ── Resumen modelo logístico ──
print(summary(modelo_logit))
##
## Call:
## glm(formula = formula_logit, family = binomial(link = "logit"),
## data = df_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.010e+02 1.593e+03 0.063 0.94949
## log_monto -7.922e-03 3.493e-02 -0.227 0.82057
## prom_mora_cliente -2.298e-01 5.653e-02 -4.064 4.82e-05 ***
## max_mora_cliente -8.771e-03 4.061e-03 -2.160 0.03081 *
## freq_mora_cliente 9.528e+00 3.081e+00 3.092 0.00199 **
## desv_mora_cliente 1.036e-01 5.702e-02 1.818 0.06911 .
## dias_hasta_vencimiento 4.318e-01 2.819e-02 15.316 < 2e-16 ***
## mes_documento 1.126e+01 6.404e-01 17.587 < 2e-16 ***
## trimestre_documento -5.032e+01 3.782e+02 -0.133 0.89414
## estado_num -9.665e+01 1.240e+03 -0.078 0.93788
## terms_grupoZ521 -1.587e+00 2.658e+00 -0.597 0.55051
## terms_grupoZ522 -2.076e+00 2.701e+00 -0.769 0.44212
## terms_grupoZ526 2.139e+00 2.658e+00 0.805 0.42104
## terms_grupoZ540 -2.224e+00 2.726e+00 -0.816 0.41464
## terms_grupoZ913 -2.108e+00 2.717e+00 -0.776 0.43794
## terms_grupoZ914 1.986e-01 2.778e+00 0.072 0.94299
## score_riesgo -4.755e+00 6.108e-01 -7.784 7.04e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19183.4 on 99141 degrees of freedom
## Residual deviance: 1941.5 on 99125 degrees of freedom
## (4 observations deleted due to missingness)
## AIC: 1975.5
##
## Number of Fisher Scoring iterations: 25
# Odds Ratios
cat("\n── Odds Ratios e IC 95% ──\n")
##
## ── Odds Ratios e IC 95% ──
or_df <- data.frame(
OR = round(exp(coef(modelo_logit)), 4),
IC_inf = round(exp(confint(modelo_logit))[, 1], 4),
IC_sup = round(exp(confint(modelo_logit))[, 2], 4)
)
## Waiting for profiling to be done...
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Waiting for profiling to be done...
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(or_df)
## OR IC_inf IC_sup
## (Intercept) 6.962089e+43 Inf 8.918136e+245
## log_monto 9.921000e-01 0.9265 1.062500e+00
## prom_mora_cliente 7.947000e-01 0.7076 8.843000e-01
## max_mora_cliente 9.913000e-01 0.9834 9.992000e-01
## freq_mora_cliente 1.373822e+04 38.8387 7.172688e+06
## desv_mora_cliente 1.109200e+00 0.9945 1.244400e+00
## dias_hasta_vencimiento 1.540000e+00 1.4600 1.630600e+00
## mes_documento 7.780116e+04 23663.6557 2.918319e+05
## trimestre_documento 0.000000e+00 0.0000 0.000000e+00
## estado_num 0.000000e+00 0.0000 0.000000e+00
## terms_grupoZ521 2.045000e-01 0.0054 6.546900e+00
## terms_grupoZ522 1.255000e-01 0.0027 4.734200e+00
## terms_grupoZ526 8.487600e+00 0.2587 3.108893e+02
## terms_grupoZ540 1.082000e-01 0.0021 4.542100e+00
## terms_grupoZ913 1.215000e-01 0.0025 5.044200e+00
## terms_grupoZ914 1.219700e+00 0.0219 7.040810e+01
## score_riesgo 8.600000e-03 0.0025 2.770000e-02
cat(sprintf("\n AIC modelo logístico completo: %.2f\n", AIC(modelo_logit)))
##
## AIC modelo logístico completo: 1975.53
# Stepwise logístico
cat("\n── Stepwise logístico (AIC) ──\n")
##
## ── Stepwise logístico (AIC) ──
modelo_logit_step <- step(modelo_logit, direction = "both", trace = 0)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat(sprintf(" AIC modelo stepwise: %.2f\n", AIC(modelo_logit_step)))
## AIC modelo stepwise: 1973.58
# Evaluación en TEST
prob_logit <- predict(modelo_logit_step, newdata = df_test, type = "response")
clase_logit <- factor(if_else(prob_logit >= 0.5, "Oportuno", "Mora"),
levels = c("Mora", "Oportuno"))
cat("\n── Matriz de Confusión — Logística (umbral 0,5) ──\n")
##
## ── Matriz de Confusión — Logística (umbral 0,5) ──
cm_logit <- confusionMatrix(data = clase_logit,
reference = df_test$pago_oportuno_bin,
positive = "Mora")
print(cm_logit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Mora Oportuno
## Mora 685 65
## Oportuno 107 41633
##
## Accuracy : 0.996
## 95% CI : (0.9953, 0.9965)
## No Information Rate : 0.9814
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8864
##
## Mcnemar's Test P-Value : 0.001771
##
## Sensitivity : 0.86490
## Specificity : 0.99844
## Pos Pred Value : 0.91333
## Neg Pred Value : 0.99744
## Prevalence : 0.01864
## Detection Rate : 0.01612
## Detection Prevalence : 0.01765
## Balanced Accuracy : 0.93167
##
## 'Positive' Class : Mora
##
cat("\nTabla:\n"); print(cm_logit$table)
##
## Tabla:
## Reference
## Prediction Mora Oportuno
## Mora 685 65
## Oportuno 107 41633
cat("\nMétricas clase:\n"); print(cm_logit$byClass)
##
## Métricas clase:
## Sensitivity Specificity Pos Pred Value
## 0.86489899 0.99844117 0.91333333
## Neg Pred Value Precision Recall
## 0.99743651 0.91333333 0.86489899
## F1 Prevalence Detection Rate
## 0.88845655 0.01863968 0.01612144
## Detection Prevalence Balanced Accuracy
## 0.01765121 0.93167008
cat("\nMétricas global:\n");print(cm_logit$overall)
##
## Métricas global:
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.959520e-01 8.863967e-01 9.953013e-01 9.965334e-01 9.813603e-01
## AccuracyPValue McnemarPValue
## 2.532200e-159 1.770686e-03
# Curva ROC
roc_df <- data.frame(
real = as.integer(df_test$pago_oportuno_bin == "Mora"),
prob = prob_logit
) %>%
arrange(desc(prob)) %>%
mutate(
tpr = cumsum(real) / sum(real),
fpr = cumsum(1 - real) / sum(1 - real)
)
auc_logit <- with(roc_df, {
n <- nrow(roc_df)
abs(sum(diff(fpr) * (tpr[-n] + tpr[-1]) / 2))
})
p_roc1 <- ggplot(roc_df, aes(x = fpr, y = tpr)) +
geom_line(color = C1, linewidth = 1.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray60") +
annotate("text", x = 0.6, y = 0.1,
label = sprintf("AUC = %.4f", auc_logit),
size = 4.5, fontface = "bold", color = C1) +
scale_x_continuous(labels = percent) +
scale_y_continuous(labels = percent) +
labs(title = "ROC — Logística (datos originales)",
subtitle = "Clase positiva: Mora",
x = "Tasa Falsos Positivos", y = "Tasa Verdaderos Positivos") +
tema_base
print(p_roc1)

cat(sprintf("\n AUC logística (original): %.4f\n", auc_logit))
##
## AUC logística (original): 0.0008
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 7 ▸ BALANCEO DE CLASES
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 7 — BALANCEO DE CLASES\n")
## SECCIÓN 7 — BALANCEO DE CLASES
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
cat("Distribución ORIGINAL en train:\n")
## Distribución ORIGINAL en train:
print(table(df_train$pago_oportuno_bin))
##
## Mora Oportuno
## 1950 97196
print(prop.table(table(df_train$pago_oportuno_bin)))
##
## Mora Oportuno
## 0.01966796 0.98033204
# Variables para balanceo (solo numéricas)
VARS_BAL <- c("pago_oportuno_bin", "log_monto", "prom_mora_cliente",
"max_mora_cliente", "freq_mora_cliente", "desv_mora_cliente",
"dias_hasta_vencimiento", "mes_documento",
"estado_num", "score_riesgo")
# Muestra estratificada para balanceo (eficiencia computacional)
# Tomar TODOS los casos de mora + muestra de oportuno
idx_mora <- which(df_train$pago_oportuno_bin == "Mora")
idx_oportuno <- which(df_train$pago_oportuno_bin == "Oportuno")
set.seed(42)
# Submuestreo del oportuno: 10x los casos de mora para balanceo razonable
n_oportuno_sample <- min(length(idx_oportuno), length(idx_mora) * 10)
idx_oportuno_sel <- sample(idx_oportuno, n_oportuno_sample)
df_bal_input <- df_train[c(idx_mora, idx_oportuno_sel), VARS_BAL] %>%
mutate(pago_oportuno_bin = as.factor(as.character(pago_oportuno_bin))) %>%
drop_na()
cat("\nMuestra para balanceo:\n")
##
## Muestra para balanceo:
cat(" Mora :", sum(df_bal_input$pago_oportuno_bin == "Mora"), "\n")
## Mora : 1950
cat(" Oportuno :", sum(df_bal_input$pago_oportuno_bin == "Oportuno"), "\n")
## Oportuno : 19500
# ── 7.1 ROSE ─────────────────────────────────────────────────────────────────
cat("\n── Método 1: ROSE (Random Over-Sampling Examples) ──\n")
##
## ── Método 1: ROSE (Random Over-Sampling Examples) ──
set.seed(42)
df_rose <- ovun.sample(
pago_oportuno_bin ~ .,
data = df_bal_input,
method = "both",
p = 0.5,
N = nrow(df_bal_input),
seed = 42
)$data
cat("Distribución después de ROSE:\n")
## Distribución después de ROSE:
print(table(df_rose$pago_oportuno_bin))
##
## Oportuno Mora
## 10715 10735
print(prop.table(table(df_rose$pago_oportuno_bin)))
##
## Oportuno Mora
## 0.4995338 0.5004662
# ── 7.2 SMOTE ────────────────────────────────────────────────────────────────
cat("\n── Método 2: SMOTE (Synthetic Minority Over-sampling) ──\n")
##
## ── Método 2: SMOTE (Synthetic Minority Over-sampling) ──
df_smote_input <- df_bal_input %>%
mutate(target_num = as.integer(pago_oportuno_bin == "Mora")) %>%
select(-pago_oportuno_bin) %>%
as.data.frame()
set.seed(42)
smote_res <- SMOTE(
X = df_smote_input[, names(df_smote_input) != "target_num"],
target = df_smote_input$target_num,
K = 5,
dup_size = 0
)
df_smote <- smote_res$data %>%
as.data.frame() %>%
rename(target_num = class) %>%
mutate(pago_oportuno_bin = factor(
if_else(as.numeric(as.character(target_num)) == 1, "Mora", "Oportuno"),
levels = c("Mora", "Oportuno")
)) %>%
select(-target_num)
colnames(df_smote) <- make.names(colnames(df_smote))
cat("Distribución después de SMOTE:\n")
## Distribución después de SMOTE:
print(table(df_smote$pago_oportuno_bin))
##
## Mora Oportuno
## 19500 19500
print(prop.table(table(df_smote$pago_oportuno_bin)))
##
## Mora Oportuno
## 0.5 0.5
# Gráfico comparativo de distribuciones
par(mfrow = c(1, 3))
barplot(table(df_bal_input$pago_oportuno_bin),
main = "ORIGINAL (muestra)", col = c(C2, C4), border = NA,
names.arg = c("Mora", "Oportuno"),
ylab = "Registros")
barplot(table(df_rose$pago_oportuno_bin),
main = "ROSE", col = c(C2, C4), border = NA,
names.arg = c("Mora", "Oportuno"))
barplot(table(df_smote$pago_oportuno_bin),
main = "SMOTE", col = c(C2, C4), border = NA,
names.arg = c("Mora", "Oportuno"))

par(mfrow = c(1, 1))
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 8 ▸ MODELO 3 — REGRESIÓN LOGÍSTICA CON DATOS BALANCEADOS (ROSE)
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 8 — MODELO 3: LOGÍSTICA + ROSE\n")
## SECCIÓN 8 — MODELO 3: LOGÍSTICA + ROSE
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# Fórmula para datos balanceados (sin terms_grupo por restricción de variables)
VARS_BAL_CLAS <- c("log_monto","prom_mora_cliente","max_mora_cliente",
"freq_mora_cliente","desv_mora_cliente",
"dias_hasta_vencimiento","mes_documento",
"estado_num","score_riesgo")
formula_bal <- as.formula(
paste("pago_oportuno_bin ~",
paste(intersect(VARS_BAL_CLAS, names(df_rose)), collapse = " + "))
)
cat("Ajustando logística con datos ROSE...\n")
## Ajustando logística con datos ROSE...
modelo_logit_rose <- glm(formula_bal,
data = df_rose,
family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat("\n── Resumen modelo logístico (ROSE) ──\n")
##
## ── Resumen modelo logístico (ROSE) ──
print(summary(modelo_logit_rose))
##
## Call:
## glm(formula = formula_bal, family = binomial(link = "logit"),
## data = df_rose)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -24.235706 267.171939 -0.091 0.928
## log_monto -0.020509 0.038021 -0.539 0.590
## prom_mora_cliente 0.149331 0.015776 9.466 < 2e-16 ***
## max_mora_cliente 0.014948 0.001607 9.301 < 2e-16 ***
## freq_mora_cliente 6.431393 6.656123 0.966 0.334
## desv_mora_cliente -0.027596 0.020252 -1.363 0.173
## dias_hasta_vencimiento -0.048665 0.008766 -5.551 2.84e-08 ***
## mes_documento -0.697432 0.066285 -10.522 < 2e-16 ***
## estado_num 30.970873 267.171157 0.116 0.908
## score_riesgo 5.187773 0.682323 7.603 2.89e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29736.0 on 21449 degrees of freedom
## Residual deviance: 2178.4 on 21440 degrees of freedom
## AIC: 2198.4
##
## Number of Fisher Scoring iterations: 21
# Stepwise
modelo_logit_rose_step <- step(modelo_logit_rose, direction = "both", trace = 0)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat(sprintf("\n AIC logística ROSE completo : %.2f\n", AIC(modelo_logit_rose)))
##
## AIC logística ROSE completo : 2198.44
cat(sprintf(" AIC logística ROSE stepwise : %.2f\n", AIC(modelo_logit_rose_step)))
## AIC logística ROSE stepwise : 2195.35
# Evaluación en TEST original (sin balancear — rendimiento real)
prob_rose <- predict(modelo_logit_rose_step, newdata = df_test, type = "response")
clase_rose <- factor(if_else(prob_rose >= 0.5, "Mora", "Oportuno"),
levels = c("Mora", "Oportuno"))
cat("\n── Matriz de Confusión — Logística ROSE (test original) ──\n")
##
## ── Matriz de Confusión — Logística ROSE (test original) ──
cm_rose <- confusionMatrix(data = clase_rose,
reference = df_test$pago_oportuno_bin,
positive = "Mora")
print(cm_rose)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Mora Oportuno
## Mora 790 1019
## Oportuno 2 40681
##
## Accuracy : 0.976
## 95% CI : (0.9745, 0.9774)
## No Information Rate : 0.9814
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.597
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99747
## Specificity : 0.97556
## Pos Pred Value : 0.43671
## Neg Pred Value : 0.99995
## Prevalence : 0.01864
## Detection Rate : 0.01859
## Detection Prevalence : 0.04257
## Balanced Accuracy : 0.98652
##
## 'Positive' Class : Mora
##
cat("\nTabla:\n"); print(cm_rose$table)
##
## Tabla:
## Reference
## Prediction Mora Oportuno
## Mora 790 1019
## Oportuno 2 40681
cat("\nMétricas clase:\n"); print(cm_rose$byClass)
##
## Métricas clase:
## Sensitivity Specificity Pos Pred Value
## 0.99747475 0.97556355 0.43670536
## Neg Pred Value Precision Recall
## 0.99995084 0.43670536 0.99747475
## F1 Prevalence Detection Rate
## 0.60745867 0.01863880 0.01859173
## Detection Prevalence Balanced Accuracy
## 0.04257272 0.98651915
cat("\nMétricas global:\n");print(cm_rose$overall)
##
## Métricas global:
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.759719e-01 5.970105e-01 9.744712e-01 9.774072e-01 9.813612e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 7.210296e-222
# ROC logística ROSE
roc_rose <- data.frame(
real = as.integer(df_test$pago_oportuno_bin == "Mora"),
prob = prob_rose
) %>%
arrange(desc(prob)) %>%
mutate(
tpr = cumsum(real) / sum(real),
fpr = cumsum(1 - real) / sum(1 - real)
)
auc_rose <- with(roc_rose, abs(sum(diff(fpr) * (tpr[-nrow(roc_rose)] + tpr[-1]) / 2)))
cat(sprintf("\n AUC logística ROSE: %.4f\n", auc_rose))
##
## AUC logística ROSE: 0.9966
p_roc2 <- ggplot(roc_rose, aes(x = fpr, y = tpr)) +
geom_line(color = C4, linewidth = 1.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray60") +
annotate("text", x = 0.6, y = 0.1,
label = sprintf("AUC = %.4f", auc_rose),
size = 4.5, fontface = "bold", color = C4) +
scale_x_continuous(labels = percent) +
scale_y_continuous(labels = percent) +
labs(title = "ROC — Logística con ROSE",
subtitle = "Clase positiva: Mora | Test original (sin balancear)",
x = "Tasa Falsos Positivos", y = "Tasa Verdaderos Positivos") +
tema_base
print(p_roc2)

# Explorar umbral óptimo para maximizar F1 en clase Mora
umbrales <- seq(0.01, 0.99, by = 0.01)
f1_umbral <- sapply(umbrales, function(u) {
pred_u <- factor(if_else(prob_rose >= u, "Mora", "Oportuno"),
levels = c("Mora", "Oportuno"))
cm_u <- suppressWarnings(
confusionMatrix(pred_u, df_test$pago_oportuno_bin, positive = "Mora"))
unname(cm_u$byClass["F1"])
})
umbral_optimo <- umbrales[which.max(replace(f1_umbral, is.na(f1_umbral), 0))]
cat(sprintf("\n Umbral óptimo para maximizar F1 (Mora): %.2f\n", umbral_optimo))
##
## Umbral óptimo para maximizar F1 (Mora): 0.97
cat(sprintf(" F1 máximo : %.4f\n",
max(f1_umbral, na.rm = TRUE)))
## F1 máximo : 0.7540
clase_rose_opt <- factor(if_else(prob_rose >= umbral_optimo, "Mora", "Oportuno"),
levels = c("Mora", "Oportuno"))
cat("\n── Matriz de Confusión — ROSE (umbral óptimo) ──\n")
##
## ── Matriz de Confusión — ROSE (umbral óptimo) ──
cm_rose_opt <- confusionMatrix(clase_rose_opt, df_test$pago_oportuno_bin,
positive = "Mora")
print(cm_rose_opt)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Mora Oportuno
## Mora 633 254
## Oportuno 159 41446
##
## Accuracy : 0.9903
## 95% CI : (0.9893, 0.9912)
## No Information Rate : 0.9814
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7491
##
## Mcnemar's Test P-Value : 3.738e-06
##
## Sensitivity : 0.79924
## Specificity : 0.99391
## Pos Pred Value : 0.71364
## Neg Pred Value : 0.99618
## Prevalence : 0.01864
## Detection Rate : 0.01490
## Detection Prevalence : 0.02087
## Balanced Accuracy : 0.89658
##
## 'Positive' Class : Mora
##
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 9 ▸ MODELO 4 — k-NN CON DATOS BALANCEADOS
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 9 — MODELO 4: k-NN\n")
## SECCIÓN 9 — MODELO 4: k-NN
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# Variables numéricas para kNN (requiere solo numéricas)
VARS_KNN <- intersect(
c("log_monto","prom_mora_cliente","max_mora_cliente",
"freq_mora_cliente","desv_mora_cliente",
"dias_hasta_vencimiento","mes_documento",
"estado_num","score_riesgo"),
names(df_smote)
)
# Preparar matrices de entrenamiento (SMOTE) y test (original)
X_train_knn <- df_smote %>%
select(all_of(make.names(VARS_KNN))) %>%
mutate(across(everything(), as.numeric)) %>%
drop_na() %>%
as.matrix()
y_train_knn <- df_smote$pago_oportuno_bin[
complete.cases(df_smote %>% select(all_of(make.names(VARS_KNN))))
]
X_test_knn <- df_test %>%
select(all_of(VARS_KNN)) %>%
mutate(across(everything(), as.numeric)) %>%
as.matrix()
y_test_knn <- df_test$pago_oportuno_bin
# Normalización min-max (OBLIGATORIA para kNN)
norm_minmax <- function(x) {
mn <- min(x, na.rm = TRUE); mx <- max(x, na.rm = TRUE)
if (mx == mn) return(rep(0, length(x)))
(x - mn) / (mx - mn)
}
X_train_norm <- apply(X_train_knn, 2, norm_minmax)
X_test_norm <- apply(X_test_knn, 2, norm_minmax)
# Eliminar NAs del conjunto de test
idx_ok <- complete.cases(X_test_norm)
X_test_norm <- X_test_norm[idx_ok, ]
y_test_knn <- y_test_knn[idx_ok]
cat(" Train kNN (SMOTE) :", nrow(X_train_norm), "registros\n")
## Train kNN (SMOTE) : 39000 registros
cat(" Test kNN (original):", nrow(X_test_norm), "registros\n\n")
## Test kNN (original): 42490 registros
# ── 9.1 Selección del K óptimo ────────────────────────────────────────────────
cat("Evaluando K = 1, 3, 5, 7, 9, 11, 13, 15...\n")
## Evaluando K = 1, 3, 5, 7, 9, 11, 13, 15...
k_vals <- c(1, 3, 5, 7, 9, 11, 13, 15)
acc_knn <- numeric(length(k_vals))
f1_knn_v <- numeric(length(k_vals))
kap_knn <- numeric(length(k_vals))
for (i in seq_along(k_vals)) {
set.seed(42)
pred_k <- knn(train = X_train_norm, test = X_test_norm,
cl = y_train_knn, k = k_vals[i])
pred_k <- factor(pred_k, levels = c("Mora","Oportuno"))
cm_k <- suppressWarnings(
confusionMatrix(pred_k, y_test_knn, positive = "Mora"))
acc_knn[i] <- cm_k$overall["Accuracy"]
f1_knn_v[i] <- cm_k$byClass["F1"]
kap_knn[i] <- cm_k$overall["Kappa"]
cat(sprintf(" K = %2d | Accuracy = %.4f | F1 = %.4f | Kappa = %.4f\n",
k_vals[i], acc_knn[i],
ifelse(is.na(f1_knn_v[i]), 0, f1_knn_v[i]),
kap_knn[i]))
}
## K = 1 | Accuracy = 0.9810 | F1 = 0.6535 | Kappa = 0.6448
## K = 3 | Accuracy = 0.9725 | F1 = 0.5735 | Kappa = 0.5619
## K = 5 | Accuracy = 0.9708 | F1 = 0.5591 | Kappa = 0.5470
## K = 7 | Accuracy = 0.9704 | F1 = 0.5572 | Kappa = 0.5450
## K = 9 | Accuracy = 0.9699 | F1 = 0.5529 | Kappa = 0.5405
## K = 11 | Accuracy = 0.9696 | F1 = 0.5510 | Kappa = 0.5385
## K = 13 | Accuracy = 0.9693 | F1 = 0.5485 | Kappa = 0.5359
## K = 15 | Accuracy = 0.9694 | F1 = 0.5489 | Kappa = 0.5363
# K óptimo: mayor Kappa (robusto al desbalance)
k_opt <- k_vals[which.max(kap_knn)]
cat(sprintf("\n✓ K óptimo: %d (Kappa = %.4f)\n", k_opt, max(kap_knn)))
##
## ✓ K óptimo: 1 (Kappa = 0.6448)
# Gráfico K vs métricas
p_knn_k <- data.frame(
K = rep(k_vals, 3),
Valor = c(acc_knn, replace(f1_knn_v, is.na(f1_knn_v), 0), kap_knn),
Metrica = rep(c("Accuracy","F1","Kappa"), each = length(k_vals))
) %>%
ggplot(aes(x = K, y = Valor, color = Metrica, group = Metrica)) +
geom_line(linewidth = 1.1) +
geom_point(size = 3) +
geom_vline(xintercept = k_opt, linetype = "dashed", color = C3, linewidth = 0.9) +
annotate("text", x = k_opt + 0.4, y = 0.05,
label = paste("K =", k_opt), color = C3, size = 3.8, fontface = "bold") +
scale_color_manual(values = c("Accuracy" = C1, "F1" = C4, "Kappa" = C5)) +
scale_x_continuous(breaks = k_vals) +
labs(title = "k-NN: Métricas por Valor de K",
subtitle = "Datos SMOTE (train) → Test original (sin balancear)",
x = "K vecinos", y = "Valor de la métrica") +
tema_base
print(p_knn_k)

# ── 9.2 Modelo kNN final ─────────────────────────────────────────────────────
set.seed(42)
pred_knn <- knn(train = X_train_norm, test = X_test_norm,
cl = y_train_knn, k = k_opt)
pred_knn <- factor(pred_knn, levels = c("Mora","Oportuno"))
cat(sprintf("\n── Matriz de Confusión — k-NN (K = %d, SMOTE) ──\n", k_opt))
##
## ── Matriz de Confusión — k-NN (K = 1, SMOTE) ──
cm_knn <- confusionMatrix(pred_knn, y_test_knn, positive = "Mora")
print(cm_knn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Mora Oportuno
## Mora 762 778
## Oportuno 30 40920
##
## Accuracy : 0.981
## 95% CI : (0.9796, 0.9823)
## No Information Rate : 0.9814
## P-Value [Acc > NIR] : 0.7243
##
## Kappa : 0.6448
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.96212
## Specificity : 0.98134
## Pos Pred Value : 0.49481
## Neg Pred Value : 0.99927
## Prevalence : 0.01864
## Detection Rate : 0.01793
## Detection Prevalence : 0.03624
## Balanced Accuracy : 0.97173
##
## 'Positive' Class : Mora
##
cat("\nTabla:\n"); print(cm_knn$table)
##
## Tabla:
## Reference
## Prediction Mora Oportuno
## Mora 762 778
## Oportuno 30 40920
cat("\nMétricas clase:\n"); print(cm_knn$byClass)
##
## Métricas clase:
## Sensitivity Specificity Pos Pred Value
## 0.96212121 0.98134203 0.49480519
## Neg Pred Value Precision Recall
## 0.99926740 0.49480519 0.96212121
## F1 Prevalence Detection Rate
## 0.65351630 0.01863968 0.01793363
## Detection Prevalence Balanced Accuracy
## 0.03624382 0.97173162
cat("\nMétricas global:\n");print(cm_knn$overall)
##
## Métricas global:
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.809838e-01 6.447711e-01 9.796400e-01 9.822612e-01 9.813603e-01
## AccuracyPValue McnemarPValue
## 7.242724e-01 3.301302e-152
# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 10 ▸ COMPARACIÓN Y SELECCIÓN DE MODELOS
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 10 — COMPARACIÓN DE MODELOS\n")
## SECCIÓN 10 — COMPARACIÓN DE MODELOS
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# ── Extraer métricas de cada modelo de clasificación ─────────────────────────
extraer_metricas <- function(cm, nombre, auc = NA) {
data.frame(
Modelo = nombre,
Accuracy = round(cm$overall["Accuracy"], 4),
Kappa = round(cm$overall["Kappa"], 4),
Precision = round(cm$byClass["Precision"], 4),
Recall = round(cm$byClass["Sensitivity"], 4),
F1 = round(cm$byClass["F1"], 4),
Specificity= round(cm$byClass["Specificity"], 4),
AUC = round(auc, 4),
row.names = NULL
)
}
tab_comp <- bind_rows(
extraer_metricas(cm_logit, "Logística (original, u=0.50)", auc_logit),
extraer_metricas(cm_rose, "Logística (ROSE, u=0.50)", auc_rose),
extraer_metricas(cm_rose_opt, sprintf("Logística (ROSE, u=%.2f)", umbral_optimo), auc_rose),
extraer_metricas(cm_knn, sprintf("k-NN (SMOTE, K=%d)", k_opt))
)
cat("── Tabla comparativa de modelos de clasificación ──\n")
## ── Tabla comparativa de modelos de clasificación ──
print(tab_comp, digits = 4)
## Modelo Accuracy Kappa Precision Recall F1
## 1 Logística (original, u=0.50) 0.9960 0.8864 0.9133 0.8649 0.8885
## 2 Logística (ROSE, u=0.50) 0.9760 0.5970 0.4367 0.9975 0.6075
## 3 Logística (ROSE, u=0.97) 0.9903 0.7491 0.7136 0.7992 0.7540
## 4 k-NN (SMOTE, K=1) 0.9810 0.6448 0.4948 0.9621 0.6535
## Specificity AUC
## 1 0.9984 0.0008
## 2 0.9756 0.9966
## 3 0.9939 0.9966
## 4 0.9813 NA
cat("\n── Regresión Lineal (Arrears_calc) ──\n")
##
## ── Regresión Lineal (Arrears_calc) ──
cat(sprintf(" RMSE Test : %.4f días\n", rmse_te))
## RMSE Test : 8.8456 días
cat(sprintf(" MAE Test : %.4f días\n", mae_te))
## MAE Test : 1.5695 días
cat(sprintf(" R² Test : %.6f\n", r2_te))
## R² Test : 0.406928
cat(sprintf(" AIC step. : %.2f\n", AIC(modelo_lm_step)))
## AIC step. : 753889.48
# Modelo recomendado (mejor F1 en clase Mora)
mejor_idx <- which.max(replace(tab_comp$F1, is.na(tab_comp$F1), 0))
cat("\n✓ MODELO RECOMENDADO (mayor F1 en clase Mora):\n")
##
## ✓ MODELO RECOMENDADO (mayor F1 en clase Mora):
cat(" →", tab_comp$Modelo[mejor_idx], "\n")
## → Logística (original, u=0.50)
cat(sprintf(" Accuracy=%.4f | F1=%.4f | Kappa=%.4f | Recall=%.4f\n",
tab_comp$Accuracy[mejor_idx],
tab_comp$F1[mejor_idx],
tab_comp$Kappa[mejor_idx],
tab_comp$Recall[mejor_idx]))
## Accuracy=0.9960 | F1=0.8885 | Kappa=0.8864 | Recall=0.8649
# Gráfico comparativo
p_comp <- tab_comp %>%
select(Modelo, Accuracy, Precision, Recall, F1, Kappa) %>%
pivot_longer(-Modelo, names_to = "Metrica", values_to = "Valor") %>%
filter(!is.na(Valor)) %>%
ggplot(aes(x = Metrica, y = Valor, fill = Modelo)) +
geom_col(position = "dodge", alpha = 0.85) +
geom_text(aes(label = round(Valor, 3)),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 2.7, fontface = "bold") +
scale_fill_manual(values = c(C1, C4, C3, C5)) +
scale_y_continuous(limits = c(0, 1.15), labels = percent) +
labs(title = "Comparación de Modelos de Clasificación",
subtitle = "Evaluación en test original (datos sin balancear)",
x = NULL, y = "Valor") +
tema_base +
theme(legend.position = "right",
legend.text = element_text(size = 8))
print(p_comp)

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 11 ▸ PANEL GRÁFICO CONSOLIDADO
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 11 — PANEL GRÁFICO CONSOLIDADO\n")
## SECCIÓN 11 — PANEL GRÁFICO CONSOLIDADO
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
# Curvas ROC superpuestas
p_roc_all <- bind_rows(
roc_df %>% mutate(Modelo = "Logística original"),
roc_rose %>% mutate(Modelo = "Logística ROSE")
) %>%
ggplot(aes(x = fpr, y = tpr, color = Modelo)) +
geom_line(linewidth = 1.2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray60") +
annotate("text", x = 0.55, y = 0.12,
label = sprintf("AUC orig = %.4f\nAUC ROSE = %.4f",
auc_logit, auc_rose),
size = 3.8, fontface = "bold", color = "gray30") +
scale_color_manual(values = c("Logística original" = C1,
"Logística ROSE" = C4)) +
scale_x_continuous(labels = percent) +
scale_y_continuous(labels = percent) +
labs(title = "Curvas ROC — Comparación",
x = "FPR", y = "TPR") +
tema_base
panel_final <- (p_target | p_bkt) /
(p_roc_all | p_knn_k) /
(p_lm | p_comp) +
plot_annotation(
title = "Panel Consolidado — Paso 4: Modelado Predictivo I2C",
subtitle = paste0(
formatC(nrow(df_work), big.mark=","), " registros RV · ",
"265 clientes · Logística + k-NN + Regresión · ",
format(Sys.time(), "%d/%m/%Y")
),
caption = "Fuente: Base_Modelado_I2C_Lista_1.xlsx | Solo facturas RV",
theme = theme(
plot.title = element_text(face = "bold", size = 15, color = C1),
plot.subtitle = element_text(size = 10, color = "gray40"),
plot.caption = element_text(size = 8, color = "gray55")
)
)
print(panel_final)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 12 ▸ EXPORTACIÓN DE RESULTADOS
# ───────────────────────────────────────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════\n")
##
## ═══════════════════════════════════════════════════════
cat(" SECCIÓN 12 — EXPORTACIÓN\n")
## SECCIÓN 12 — EXPORTACIÓN
cat("═══════════════════════════════════════════════════════\n\n")
## ═══════════════════════════════════════════════════════
dir.create("output_modelado", recursive = TRUE, showWarnings = FALSE)
# Predicciones del mejor modelo en todo el dataset de trabajo
prob_final <- predict(modelo_logit_rose_step,
newdata = df_work, type = "response")
clase_final <- factor(if_else(prob_final >= umbral_optimo, "Mora", "Oportuno"),
levels = c("Mora", "Oportuno"))
df_pred_export <- df_work %>%
select(anon_customer_id, anon_document_id, document_date,
terms_of_payment, amount_in_local_currency,
arrears_calc, pago_oportuno_bin, bucket_mora_calc,
prom_mora_cliente, freq_mora_cliente) %>%
mutate(
prob_mora = round(prob_final, 4),
pred_clase = clase_final,
clasificacion_ok = if_else(pred_clase == pago_oportuno_bin,
"Correcta", "Error"),
nivel_riesgo = case_when(
prob_final >= 0.70 ~ "RIESGO ALTO",
prob_final >= 0.40 ~ "RIESGO MEDIO",
prob_final >= 0.20 ~ "RIESGO BAJO",
TRUE ~ "SIN RIESGO"
)
)
# Exportar CSVs
write.csv(df_pred_export, "output_modelado/predicciones_modelo_final.csv",
row.names = FALSE, fileEncoding = "UTF-8")
cat("✓ Predicciones exportadas\n")
## ✓ Predicciones exportadas
write.csv(tab_comp, "output_modelado/tabla_comparacion_modelos.csv",
row.names = FALSE)
cat("✓ Tabla comparativa exportada\n")
## ✓ Tabla comparativa exportada
coef_export <- data.frame(
Variable = names(coef(modelo_logit_rose_step)),
Coeficiente= round(coef(modelo_logit_rose_step), 4),
Odds_Ratio = round(exp(coef(modelo_logit_rose_step)), 4)
)
write.csv(coef_export, "output_modelado/coeficientes_logistica_rose.csv",
row.names = FALSE)
cat("✓ Coeficientes exportados\n")
## ✓ Coeficientes exportados
# Guardar gráficos
ggsave("output_modelado/panel_consolidado.png",
plot = panel_final, width = 20, height = 16, dpi = 150, bg = "white")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave("output_modelado/comparacion_modelos.png",
plot = p_comp, width = 12, height = 6, dpi = 150, bg = "white")
ggsave("output_modelado/curvas_roc.png",
plot = p_roc_all, width = 8, height = 6, dpi = 150, bg = "white")
ggsave("output_modelado/knn_k_selection.png",
plot = p_knn_k, width = 9, height = 5, dpi = 150, bg = "white")
ggsave("output_modelado/regresion_real_vs_predicho.png",
plot = p_lm, width = 8, height = 6, dpi = 150, bg = "white")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
cat("✓ Gráficos exportados en ./output_modelado/\n")
## ✓ Gráficos exportados en ./output_modelado/
# ── Resumen final ─────────────────────────────────────────────────────────────
cat("\n", strrep("═", 60), "\n", sep="")
##
## ════════════════════════════════════════════════════════════
cat(" RESUMEN FINAL — PASO 4 MODELADO PREDICTIVO I2C\n")
## RESUMEN FINAL — PASO 4 MODELADO PREDICTIVO I2C
cat(strrep("═", 60), "\n\n", sep="")
## ════════════════════════════════════════════════════════════
cat(" DATASET\n")
## DATASET
cat(sprintf(" Registros : %s (solo RV)\n",
formatC(nrow(df_work), big.mark = ",")))
## Registros : 141,638 (solo RV)
cat(sprintf(" Clientes : %d\n", n_distinct(df$anon_customer_id)))
## Clientes : 265
cat(sprintf(" Target 0(Mora): %s (%.1f%%)\n",
formatC(sum(df_work$pago_oportuno_bin=="Mora"), big.mark=","),
mean(df_work$pago_oportuno_bin=="Mora")*100))
## Target 0(Mora): 2,742 (1.9%)
cat(sprintf(" Target 1(Op.) : %s (%.1f%%)\n\n",
formatC(sum(df_work$pago_oportuno_bin=="Oportuno"), big.mark=","),
mean(df_work$pago_oportuno_bin=="Oportuno")*100))
## Target 1(Op.) : 138,896 (98.1%)
cat(" CLASIFICACIÓN (Pago_Oportuno_Bin — clase positiva: Mora)\n")
## CLASIFICACIÓN (Pago_Oportuno_Bin — clase positiva: Mora)
cat(" ┌──────────────────────────────┬──────────┬──────────┬──────────┬──────────┐\n")
## ┌──────────────────────────────┬──────────┬──────────┬──────────┬──────────┐
cat(" │ Modelo │ Accuracy │ F1 │ Kappa │ AUC │\n")
## │ Modelo │ Accuracy │ F1 │ Kappa │ AUC │
cat(" ├──────────────────────────────┼──────────┼──────────┼──────────┼──────────┤\n")
## ├──────────────────────────────┼──────────┼──────────┼──────────┼──────────┤
for (i in seq_len(nrow(tab_comp))) {
cat(sprintf(" │ %-28s │ %.4f │ %.4f │ %.4f │ %s │\n",
substr(tab_comp$Modelo[i], 1, 28),
tab_comp$Accuracy[i],
ifelse(is.na(tab_comp$F1[i]), 0, tab_comp$F1[i]),
tab_comp$Kappa[i],
ifelse(is.na(tab_comp$AUC[i]), " N/A ", sprintf("%.4f", tab_comp$AUC[i]))))
}
## │ Logística (original, u=0.50 │ 0.9960 │ 0.8885 │ 0.8864 │ 0.0008 │
## │ Logística (ROSE, u=0.50 │ 0.9760 │ 0.6075 │ 0.5970 │ 0.9966 │
## │ Logística (ROSE, u=0.97 │ 0.9903 │ 0.7540 │ 0.7491 │ 0.9966 │
## │ k-NN (SMOTE, K=1) │ 0.9810 │ 0.6535 │ 0.6448 │ N/A │
cat(" └──────────────────────────────┴──────────┴──────────┴──────────┴──────────┘\n")
## └──────────────────────────────┴──────────┴──────────┴──────────┴──────────┘
cat("\n REGRESIÓN LINEAL (Arrears_calc)\n")
##
## REGRESIÓN LINEAL (Arrears_calc)
cat(sprintf(" RMSE Test : %.4f días\n", rmse_te))
## RMSE Test : 8.8456 días
cat(sprintf(" MAE Test : %.4f días\n", mae_te))
## MAE Test : 1.5695 días
cat(sprintf(" R² Test : %.6f\n", r2_te))
## R² Test : 0.406928
cat(sprintf("\n MODELO RECOMENDADO: %s\n", tab_comp$Modelo[mejor_idx]))
##
## MODELO RECOMENDADO: Logística (original, u=0.50)
cat(sprintf(" Umbral de decisión: %.2f\n", umbral_optimo))
## Umbral de decisión: 0.97
cat("\n Archivos generados en ./output_modelado/\n")
##
## Archivos generados en ./output_modelado/
cat(strrep("═", 60), "\n", sep="")
## ════════════════════════════════════════════════════════════
cat("✅ PASO 4 COMPLETADO\n\n")
## ✅ PASO 4 COMPLETADO
# ═══════════════════════════════════════════════════════════════════════════════
# GUÍA DE INTERPRETACIÓN
# ═══════════════════════════════════════════════════════════════════════════════
#
# REGRESIÓN LINEAL (Arrears_calc)
# ─────────────────────────────────────────────────────────────────────────────
# • RMSE en días: error promedio del modelo en la predicción de días de atraso.
# Un RMSE de 5 días significa que el modelo se equivoca en promedio ±5 días.
# • R² ≈ 0: indica que las variables disponibles explican poco la variabilidad
# de arrears. Esto es esperado porque el 98% tiene arrears ≤ 0 (al día).
# • Coeficiente positivo en prom_mora_cliente → clientes con historial de mora
# alto tienden a acumular más días de atraso.
# • DW ≈ 2 → residuos sin autocorrelación (modelo bien especificado).
# • BIC mínimo → equilibrio óptimo entre ajuste y complejidad del modelo.
#
# REGRESIÓN LOGÍSTICA (Pago_Oportuno_Bin)
# ─────────────────────────────────────────────────────────────────────────────
# • OR > 1: la variable aumenta la probabilidad de MORA.
# • OR < 1: la variable disminuye la probabilidad de mora (protector).
# • AUC > 0.7 = modelo útil; > 0.8 = muy bueno; > 0.9 = excelente.
# • El umbral óptimo (umbral_optimo) maximiza F1 en la clase Mora.
# → Un umbral bajo captura más casos de mora (mayor Recall) a costa
# de más falsos positivos (menor Precision).
#
# k-NN
# ─────────────────────────────────────────────────────────────────────────────
# • La normalización min-max es obligatoria: sin ella las variables de mayor
# escala (log_monto) dominarían las distancias.
# • K pequeño → sobreajuste; K grande → subajuste. El K óptimo se elige
# por mayor Kappa (métrica robusta al desbalance severo).
# • El modelo se entrena con SMOTE y se evalúa en test ORIGINAL.
#
# BALANCEO DE CLASES (desbalance 98,1% / 1,9%)
# ─────────────────────────────────────────────────────────────────────────────
# • Sin balanceo, el modelo predice casi siempre "Oportuno" y obtiene
# Accuracy ≈ 98% pero Recall(Mora) ≈ 0% → completamente inútil para
# detectar mora.
# • ROSE y SMOTE sintetizan casos de mora para que el modelo los aprenda.
# • Evaluar SIEMPRE en el test original (sin balancear) para medir el
# rendimiento real en producción.
# • Kappa y F1 son las métricas correctas con desbalance severo.
# Accuracy sola es engañosa.
#
# SELECCIÓN DEL MODELO
# ─────────────────────────────────────────────────────────────────────────────
# • Priorizar el modelo con mayor F1(Mora) y Kappa.
# • Si el negocio penaliza más los falsos negativos (mora no detectada),
# bajar el umbral para maximizar Recall a costa de Precision.
# • Si el negocio penaliza los falsos positivos (alertas innecesarias),
# subir el umbral para maximizar Precision.
# ═══════════════════════════════════════════════════════════════════════════════
# ═══════════════════════════════════════════════════════════════════════════════
# SECCIÓN 11 — GRÁFICOS INDIVIDUALES + PANEL CONSOLIDADO (MEJORADO)
# Ejecutar después de completar las Secciones 1–10 del script principal.
# Genera cada gráfico por separado (legibles) y un panel final limpio.
# ═══════════════════════════════════════════════════════════════════════════════
# REQUISITO: tener en el entorno los objetos creados en las secciones anteriores:
# df_work, tab_comp, roc_df, roc_rose, auc_logit, auc_rose,
# umbral_optimo, k_opt, k_vals, acc_knn, f1_knn_v, kap_knn,
# rmse_te, r2_te, pred_lm_test, df_test, prob_rose, C1, C2, C3, C4, C5
# ═══════════════════════════════════════════════════════════════════════════════
library(ggplot2)
library(patchwork)
library(dplyr)
library(tidyr)
library(scales)
dir.create("output_modelado/graficos", recursive = TRUE, showWarnings = FALSE)
# ── Tema refinado para gráficos individuales ──────────────────────────────────
tema_plot <- theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1B3A6B",
margin = margin(b = 6)),
plot.subtitle = element_text(size = 11, color = "#555555",
margin = margin(b = 10)),
plot.caption = element_text(size = 9, color = "#999999",
margin = margin(t = 8)),
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
panel.grid.major = element_line(color = "#EEEEEE", linewidth = 0.5),
panel.grid.minor = element_blank(),
axis.title = element_text(size = 11, color = "#333333"),
axis.text = element_text(size = 10, color = "#444444"),
legend.position = "bottom",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 10),
legend.background = element_rect(fill = "white", color = NA),
plot.margin = margin(16, 16, 12, 16)
)
C1 <- "#1B3A6B"; C2 <- "#E84040"; C3 <- "#F5A623"; C4 <- "#27AE60"; C5 <- "#8E44AD"
PALETA_BUCKET <- c(
"Al dia" = C4, "1-30" = "#5BA85A",
"31-60" = C3, "61-90" = "#D4891A",
"91-180" = "#C05010", "181-360" = C2, ">360" = "#7B0000"
)
# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 1 — Variable objetivo: Pago_Oportuno_Bin
# ═══════════════════════════════════════════════════════════════════════════════
tab_target <- df_work %>%
count(pago_oportuno_bin) %>%
mutate(
pct = round(n / sum(n) * 100, 1),
label = paste0(comma(n), "\n(", pct, "%)")
)
g1 <- ggplot(tab_target,
aes(x = pago_oportuno_bin, y = n, fill = pago_oportuno_bin)) +
geom_col(width = 0.55, alpha = 0.92, show.legend = FALSE) +
geom_text(aes(label = label), vjust = -0.4,
size = 4.5, fontface = "bold", color = "#222222") +
scale_fill_manual(values = c("Mora" = C2, "Oportuno" = C4)) +
scale_y_continuous(labels = comma,
expand = expansion(mult = c(0, 0.18))) +
labs(
title = "Variable Objetivo: Pago_Oportuno_Bin",
subtitle = "Desbalance severo — clave para estrategia de balanceo",
x = NULL,
y = "Número de registros",
caption = "Clase positiva de interés: Mora (0)"
) +
tema_plot +
theme(axis.text.x = element_text(size = 12, face = "bold"))
print(g1)

ggsave("output_modelado/graficos/G1_variable_objetivo.png",
g1, width = 7, height = 5.5, dpi = 180, bg = "white")
cat("✓ G1 guardado\n")
## ✓ G1 guardado
# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 2 — Distribución Bucket_Mora
# ═══════════════════════════════════════════════════════════════════════════════
tab_bkt <- df_work %>%
filter(!is.na(bucket_mora_calc)) %>%
count(bucket_mora_calc) %>%
mutate(
pct = round(n / sum(n) * 100, 2),
bucket_mora_calc = factor(bucket_mora_calc,
levels = c("Al dia","1-30","31-60","61-90","91-180","181-360",">360"))
)
g2 <- ggplot(tab_bkt,
aes(x = bucket_mora_calc, y = n, fill = bucket_mora_calc)) +
geom_col(width = 0.7, alpha = 0.92, show.legend = FALSE) +
geom_text(aes(label = paste0(pct, "%")),
vjust = -0.5, size = 4, fontface = "bold", color = "#222222") +
scale_fill_manual(values = PALETA_BUCKET) +
scale_y_continuous(labels = comma,
expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Distribución por Bucket_Mora",
subtitle = "Tramos de mora según días de atraso (Arrears_calc)",
x = "Tramo de mora",
y = "Registros",
caption = "Calculado desde Arrears_calc recalculado"
) +
tema_plot
print(g2)

ggsave("output_modelado/graficos/G2_bucket_mora.png",
g2, width = 8, height = 5.5, dpi = 180, bg = "white")
cat("✓ G2 guardado\n")
## ✓ G2 guardado
# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 3 — Curvas ROC superpuestas
# ═══════════════════════════════════════════════════════════════════════════════
roc_combinado <- bind_rows(
roc_df %>% select(fpr, tpr) %>% mutate(Modelo = "Logística original"),
roc_rose %>% select(fpr, tpr) %>% mutate(Modelo = "Logística + ROSE")
)
g3 <- ggplot(roc_combinado, aes(x = fpr, y = tpr, color = Modelo)) +
# Área bajo la curva (relleno suave)
geom_ribbon(data = roc_df %>% mutate(Modelo = "Logística original"),
aes(ymin = 0, ymax = tpr, x = fpr),
fill = C1, alpha = 0.08, inherit.aes = FALSE) +
geom_ribbon(data = roc_rose %>% mutate(Modelo = "Logística + ROSE"),
aes(ymin = 0, ymax = tpr, x = fpr),
fill = C4, alpha = 0.08, inherit.aes = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed",
color = "gray70", linewidth = 0.8) +
geom_line(linewidth = 1.6) +
# Etiquetas AUC con caja de fondo
annotate("label", x = 0.60, y = 0.22,
label = sprintf("AUC Original = %.4f", auc_logit),
color = C1, fill = "white", size = 4, fontface = "bold",
label.padding = unit(0.4, "lines"), label.r = unit(0.2, "lines")) +
annotate("label", x = 0.60, y = 0.10,
label = sprintf("AUC ROSE = %.4f", auc_rose),
color = C4, fill = "white", size = 4, fontface = "bold",
label.padding = unit(0.4, "lines"), label.r = unit(0.2, "lines")) +
scale_color_manual(values = c("Logística original" = C1,
"Logística + ROSE" = C4),
name = "Modelo") +
scale_x_continuous(labels = percent, limits = c(0, 1),
breaks = seq(0, 1, 0.25)) +
scale_y_continuous(labels = percent, limits = c(0, 1),
breaks = seq(0, 1, 0.25)) +
labs(
title = "Curvas ROC — Comparación de Modelos",
subtitle = "Clase positiva: Mora | Test original sin balancear",
x = "Tasa de Falsos Positivos (1 – Especificidad)",
y = "Tasa de Verdaderos Positivos (Sensibilidad)",
caption = "Línea diagonal = clasificador aleatorio (AUC = 0.5)"
) +
tema_plot +
theme(legend.position = c(0.72, 0.18),
legend.background = element_rect(fill = "white", color = "#DDDDDD",
linewidth = 0.4))
print(g3)

ggsave("output_modelado/graficos/G3_curvas_ROC.png",
g3, width = 7.5, height = 6.5, dpi = 180, bg = "white")
cat("✓ G3 guardado\n")
## ✓ G3 guardado
# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 4 — k-NN: métricas por valor de K
# ═══════════════════════════════════════════════════════════════════════════════
df_knn_metricas <- data.frame(
K = rep(k_vals, 3),
Valor = c(acc_knn,
replace(f1_knn_v, is.na(f1_knn_v), 0),
kap_knn),
Metrica = rep(c("Accuracy", "F1 (Mora)", "Kappa"), each = length(k_vals))
)
g4 <- ggplot(df_knn_metricas,
aes(x = K, y = Valor, color = Metrica, group = Metrica)) +
geom_line(linewidth = 1.4) +
geom_point(size = 4, stroke = 1.5, shape = 21,
aes(fill = Metrica), color = "white") +
geom_vline(xintercept = k_opt, linetype = "dashed",
color = C3, linewidth = 1) +
annotate("label", x = k_opt, y = max(acc_knn) * 1.01,
label = paste0("K óptimo = ", k_opt),
color = C3, fill = "white", size = 4, fontface = "bold",
label.padding = unit(0.4,"lines"),
label.r = unit(0.2,"lines")) +
scale_color_manual(values = c("Accuracy" = C1, "F1 (Mora)" = C4, "Kappa" = C5),
name = "Métrica") +
scale_fill_manual(values = c("Accuracy" = C1, "F1 (Mora)" = C4, "Kappa" = C5),
guide = "none") +
scale_x_continuous(breaks = k_vals) +
scale_y_continuous(labels = percent,
breaks = seq(0, 1, 0.1),
limits = c(0, max(acc_knn) * 1.06)) +
labs(
title = "k-NN: Métricas por Valor de K",
subtitle = "Entrenado con SMOTE · Evaluado en test original",
x = "Número de vecinos (K)",
y = "Valor de la métrica",
caption = paste0("K óptimo seleccionado por mayor Kappa | ",
"Datos SMOTE (train) → Test original (sin balancear)")
) +
tema_plot
print(g4)

ggsave("output_modelado/graficos/G4_kNN_metricas_K.png",
g4, width = 8, height = 5.5, dpi = 180, bg = "white")
cat("✓ G4 guardado\n")
## ✓ G4 guardado
# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 5 — Regresión lineal: Real vs Predicho
# ═══════════════════════════════════════════════════════════════════════════════
# Construir dataframe alineado (excluir NAs del test)
df_lm_plot <- data.frame(
real = df_test$arrears_calc,
pred = pred_lm_test
) %>% drop_na()
# Muestra para rendimiento (máx 15.000 puntos)
set.seed(42)
df_lm_sample <- df_lm_plot %>%
slice_sample(n = min(15000, nrow(df_lm_plot)))
# Límites simétricos p1-p99
lim_inf <- quantile(df_lm_plot$real, 0.01, na.rm = TRUE)
lim_sup <- quantile(df_lm_plot$real, 0.99, na.rm = TRUE)
g5 <- ggplot(df_lm_sample, aes(x = real, y = pred)) +
geom_point(alpha = 0.12, size = 0.8, color = C1) +
geom_abline(slope = 1, intercept = 0,
color = C2, linetype = "dashed", linewidth = 1.2) +
geom_smooth(method = "lm", color = C4, linewidth = 1,
se = TRUE, alpha = 0.15, linetype = "solid") +
coord_cartesian(xlim = c(lim_inf, lim_sup),
ylim = c(lim_inf, lim_sup)) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
annotate("label", x = lim_inf + (lim_sup - lim_inf) * 0.04,
y = lim_sup * 0.95,
label = sprintf("RMSE = %.4f días\nR² = %.6f", rmse_te, r2_te),
hjust = 0, size = 4, fontface = "bold", color = "#222222",
fill = "white", label.padding = unit(0.5,"lines"),
label.r = unit(0.2,"lines")) +
labs(
title = "Regresión Lineal: Arrears Real vs Predicho",
subtitle = "Muestra aleatoria p1–p99 | Stepwise AIC",
x = "Arrears real (días de atraso)",
y = "Arrears predicho (días de atraso)",
caption = "Línea roja = predicción perfecta | Línea verde = tendencia real (OLS)"
) +
tema_plot
print(g5)
## `geom_smooth()` using formula = 'y ~ x'

ggsave("output_modelado/graficos/G5_regresion_real_vs_predicho.png",
g5, width = 7.5, height = 6.5, dpi = 180, bg = "white")
## `geom_smooth()` using formula = 'y ~ x'
cat("✓ G5 guardado\n")
## ✓ G5 guardado
# ═══════════════════════════════════════════════════════════════════════════════
# GRÁFICO 6 — Comparación de modelos de clasificación
# ═══════════════════════════════════════════════════════════════════════════════
# Tabla limpia: nombre corto para visualización
tab_comp_plot <- tab_comp %>%
mutate(
Modelo_corto = case_when(
grepl("original.*0.50", Modelo) ~ "Logística\noriginal\nu=0.50",
grepl("ROSE.*0.50", Modelo) ~ "Logística\nROSE\nu=0.50",
grepl("ROSE.*0\\.[0-9]", Modelo) ~ paste0("Logística\nROSE\nu=", umbral_optimo),
grepl("k-NN", Modelo) ~ paste0("k-NN\n(SMOTE)\nK=", k_opt),
TRUE ~ Modelo
)
) %>%
select(Modelo_corto, Accuracy, Precision, Recall, F1, Kappa) %>%
pivot_longer(-Modelo_corto, names_to = "Metrica", values_to = "Valor") %>%
filter(!is.na(Valor)) %>%
mutate(
Metrica = factor(Metrica,
levels = c("Accuracy","Precision","Recall","F1","Kappa")),
Modelo_corto = factor(Modelo_corto,
levels = unique(Modelo_corto))
)
# Colores por modelo
colores_modelo <- setNames(
c(C1, C4, C3, C5),
levels(tab_comp_plot$Modelo_corto)
)
g6 <- ggplot(tab_comp_plot,
aes(x = Metrica, y = Valor, fill = Modelo_corto)) +
geom_col(position = position_dodge(width = 0.78),
alpha = 0.90, width = 0.72) +
geom_text(aes(label = sprintf("%.3f", Valor)),
position = position_dodge(width = 0.78),
vjust = -0.45, size = 3.2, fontface = "bold",
color = "#222222") +
geom_hline(yintercept = c(0.25, 0.50, 0.75),
color = "#DDDDDD", linewidth = 0.4, linetype = "solid") +
scale_fill_manual(values = colores_modelo, name = "Modelo") +
scale_y_continuous(limits = c(0, 1.12),
breaks = seq(0, 1, 0.25),
labels = percent_format(accuracy = 1)) +
labs(
title = "Comparación de Modelos de Clasificación",
subtitle = "Clase positiva: Mora | Evaluación en conjunto de prueba (test original)",
x = NULL,
y = "Valor de la métrica",
caption = paste0("Accuracy alta no implica buen modelo con desbalance severo.",
"\nPriorizar F1 y Kappa en clase Mora.")
) +
tema_plot +
theme(
legend.position = "bottom",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9),
axis.text.x = element_text(size = 11, face = "bold", color = "#1B3A6B")
) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE))
print(g6)

ggsave("output_modelado/graficos/G6_comparacion_modelos.png",
g6, width = 10, height = 6.5, dpi = 180, bg = "white")
cat("✓ G6 guardado\n")
## ✓ G6 guardado
# ═══════════════════════════════════════════════════════════════════════════════
# PANEL FINAL CONSOLIDADO — diseño limpio 3 × 2
# ═══════════════════════════════════════════════════════════════════════════════
# Tema más compacto para el panel
tema_panel <- theme_minimal(base_size = 10) +
theme(
plot.title = element_text(face = "bold", size = 11, color = "#1B3A6B"),
plot.subtitle = element_text(size = 8.5, color = "#666666"),
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
panel.grid.major = element_line(color = "#F0F0F0", linewidth = 0.4),
panel.grid.minor = element_blank(),
axis.title = element_text(size = 9),
axis.text = element_text(size = 8.5),
legend.position = "bottom",
legend.text = element_text(size = 8),
legend.title = element_text(size = 8.5, face = "bold"),
legend.key.size = unit(0.45, "cm"),
plot.margin = margin(10, 10, 8, 10)
)
# Versiones compactas de cada gráfico ─────────────────────────────────────────
# P1 — Objetivo
p1 <- ggplot(tab_target,
aes(x = pago_oportuno_bin, y = n, fill = pago_oportuno_bin)) +
geom_col(width = 0.5, alpha = 0.9, show.legend = FALSE) +
geom_text(aes(label = paste0("(", pct, "%)")),
vjust = -0.4, size = 3.5, fontface = "bold") +
scale_fill_manual(values = c("Mora" = C2, "Oportuno" = C4)) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.18))) +
labs(title = "Variable Objetivo", subtitle = "Desbalance: 98,1% vs 1,9%",
x = NULL, y = "Registros") +
tema_panel
# P2 — Bucket
p2 <- ggplot(tab_bkt, aes(x = bucket_mora_calc, y = n, fill = bucket_mora_calc)) +
geom_col(width = 0.7, alpha = 0.9, show.legend = FALSE) +
geom_text(aes(label = paste0(pct, "%")), vjust = -0.4, size = 3, fontface = "bold") +
scale_fill_manual(values = PALETA_BUCKET) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.18))) +
labs(title = "Distribución Bucket_Mora", subtitle = "Tramos de mora por días de atraso",
x = NULL, y = "Registros") +
tema_panel +
theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))
# P3 — ROC
p3 <- ggplot(roc_combinado, aes(x = fpr, y = tpr, color = Modelo)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed",
color = "gray75", linewidth = 0.7) +
geom_line(linewidth = 1.3) +
annotate("text", x = 0.55, y = 0.18,
label = sprintf("AUC orig: %.4f\nAUC ROSE: %.4f", auc_logit, auc_rose),
size = 3.2, fontface = "bold", color = "#333333") +
scale_color_manual(values = c("Logística original" = C1,
"Logística + ROSE" = C4), name = NULL) +
scale_x_continuous(labels = percent, breaks = c(0, 0.5, 1)) +
scale_y_continuous(labels = percent, breaks = c(0, 0.5, 1)) +
labs(title = "Curvas ROC", subtitle = "Clase positiva: Mora",
x = "FPR", y = "TPR") +
tema_panel +
theme(legend.position = c(0.65, 0.15))
# P4 — kNN
p4 <- ggplot(df_knn_metricas,
aes(x = K, y = Valor, color = Metrica, group = Metrica)) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
geom_vline(xintercept = k_opt, linetype = "dashed", color = C3, linewidth = 0.9) +
annotate("text", x = k_opt + 0.5, y = min(kap_knn),
label = paste0("K=", k_opt), color = C3, size = 3.5, fontface = "bold") +
scale_color_manual(values = c("Accuracy" = C1, "F1 (Mora)" = C4, "Kappa" = C5),
name = NULL) +
scale_x_continuous(breaks = k_vals) +
scale_y_continuous(labels = percent) +
labs(title = paste0("k-NN: Métricas vs K (K óptimo = ", k_opt, ")"),
subtitle = "Train: SMOTE · Test: original",
x = "K vecinos", y = "Valor") +
tema_panel
# P5 — Regresión real vs predicho
p5 <- ggplot(df_lm_sample %>% filter(!is.na(real), !is.na(pred)),
aes(x = real, y = pred)) +
geom_point(alpha = 0.10, size = 0.6, color = C1) +
geom_abline(slope = 1, intercept = 0, color = C2,
linetype = "dashed", linewidth = 1) +
geom_smooth(method = "lm", color = C4, linewidth = 0.9,
se = FALSE) +
coord_cartesian(xlim = c(lim_inf, lim_sup),
ylim = c(lim_inf, lim_sup)) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
annotate("label", x = lim_inf + (lim_sup-lim_inf)*0.03,
y = lim_sup * 0.94,
label = sprintf("RMSE=%.2f\nR²=%.4f", rmse_te, r2_te),
hjust = 0, size = 3.2, fill = "white",
label.padding = unit(0.3,"lines")) +
labs(title = "Regresión Lineal: Real vs Predicho",
subtitle = "Arrears_calc (días de atraso)",
x = "Real", y = "Predicho") +
tema_panel
# P6 — Comparación modelos (versión compacta)
tab_comp_compact <- tab_comp_plot %>%
filter(Metrica %in% c("Accuracy","F1","Kappa"))
colores_panel <- setNames(
c(C1, C4, C3, C5),
levels(tab_comp_plot$Modelo_corto)
)
p6 <- ggplot(tab_comp_compact,
aes(x = Metrica, y = Valor, fill = Modelo_corto)) +
geom_col(position = position_dodge(width = 0.75),
alpha = 0.88, width = 0.70) +
geom_text(aes(label = sprintf("%.2f", Valor)),
position = position_dodge(width = 0.75),
vjust = -0.4, size = 2.8, fontface = "bold") +
scale_fill_manual(values = colores_panel, name = NULL) +
scale_y_continuous(limits = c(0, 1.18),
labels = percent_format(accuracy = 1)) +
labs(title = "Comparación de Modelos",
subtitle = "Evaluación en test original",
x = NULL, y = "Valor") +
tema_panel +
theme(
axis.text.x = element_text(size = 9, face = "bold"),
legend.position = "bottom"
) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE,
override.aes = list(size = 0.5)))
# ── Ensamblar panel final ──────────────────────────────────────────────────────
panel_final <- (p1 | p2 | p3) / (p4 | p5 | p6) +
plot_annotation(
title = "Panel Consolidado — Paso 4: Modelado Predictivo I2C",
subtitle = paste0(
formatC(nrow(df_work), big.mark = ","),
" registros RV · 265 clientes · Logística + k-NN + Regresión Lineal · ",
format(Sys.Date(), "%d/%m/%Y")
),
caption = "Fuente: Base_Modelado_I2C_Lista_1.xlsx | Solo facturas RV | CRISP-DM Paso 4",
theme = theme(
plot.title = element_text(face = "bold", size = 16, color = "#1B3A6B",
margin = margin(b = 4)),
plot.subtitle = element_text(size = 11, color = "#555555",
margin = margin(b = 8)),
plot.caption = element_text(size = 9, color = "#999999",
margin = margin(t = 8)),
plot.background = element_rect(fill = "white", color = NA),
plot.margin = margin(16, 16, 12, 16)
)
)
print(panel_final)
## `geom_smooth()` using formula = 'y ~ x'

ggsave("output_modelado/graficos/Panel_Consolidado_Final.png",
panel_final,
width = 22,
height = 14,
dpi = 180,
bg = "white")
## `geom_smooth()` using formula = 'y ~ x'
cat("\n✓ Panel consolidado exportado: output_modelado/graficos/Panel_Consolidado_Final.png\n")
##
## ✓ Panel consolidado exportado: output_modelado/graficos/Panel_Consolidado_Final.png
cat("\n── Gráficos individuales exportados ──────────────────\n")
##
## ── Gráficos individuales exportados ──────────────────
cat(" G1: Variable objetivo (Pago_Oportuno_Bin)\n")
## G1: Variable objetivo (Pago_Oportuno_Bin)
cat(" G2: Distribución Bucket_Mora\n")
## G2: Distribución Bucket_Mora
cat(" G3: Curvas ROC superpuestas\n")
## G3: Curvas ROC superpuestas
cat(" G4: k-NN métricas por K\n")
## G4: k-NN métricas por K
cat(" G5: Regresión lineal real vs predicho\n")
## G5: Regresión lineal real vs predicho
cat(" G6: Comparación de 4 modelos\n")
## G6: Comparación de 4 modelos
cat(" → Carpeta: output_modelado/graficos/\n\n")
## → Carpeta: output_modelado/graficos/
cat("✅ Sección 11 completada\n")
## ✅ Sección 11 completada