FINAL

# ═══════════════════════════════════════════════════════════════════════════════
#  ANÁLISIS EXPLORATORIO DE DATOS — BASE I2C / CARTERA
#  14 variables | 341 027 registros
#  Flujo: (1) Análisis general → (2) Variable por variable en orden original
# ═══════════════════════════════════════════════════════════════════════════════
#
#  Variables (en orden original de la base):
#   1.  Anon_Customer_ID              → Identificador de cliente      (ID)
#   2.  Anon_Document_ID              → Identificador de documento     (ID)
#   3.  Terms_of_Payment              → Condición de pago              (Categórica)
#   4.  Document_Type                 → Tipo de documento              (Categórica)
#   5.  Document_Date                 → Fecha del documento            (Fecha)
#   6.  Payment_date                  → Fecha de pago                  (Fecha)
#   7.  Net_due_date                  → Fecha límite de pago           (Fecha)
#   8.  Arrears_after_net_due_date    → Días de atraso                 (Numérica)
#   9.  Amount_in_local_currency      → Monto en moneda local          (Numérica)
#  10.  Reason_code                   → Código de razón                (Categórica/ID)
#  11.  Clearing_date                 → Fecha de compensación          (Fecha)
#  12.  Year/month                    → Período año/mes                (Categórica/Temporal)
#  13.  Estado_Cartera                → Abierta / Cerrada              (Categórica binaria)
#  14.  Bucket_Mora                   → Tramo de mora                  (Categórica ordenada)
#        (Pago_Oportuno_Bin eliminada por solicitud del usuario)
# ═══════════════════════════════════════════════════════════════════════════════


# ───────────────────────────────────────────────────────────────────────────────
# SECCIÓN 0 ▸ INSTALACIÓN Y CARGA DE LIBRERÍAS
# ───────────────────────────────────────────────────────────────────────────────

paquetes <- c(
  "tidyverse", "readxl", "janitor", "skimr", "moments",
  "ggplot2", "patchwork", "lubridate", "scales", "writexl"
)

for (pkg in paquetes) {
  if (!requireNamespace(pkg, quietly = TRUE))
    install.packages(pkg, repos = "https://cran.r-project.org")
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'dplyr' was built under R version 4.5.3

## Warning: package 'stringr' was built under R version 4.5.3

## Warning: package 'forcats' was built under R version 4.5.3

## Warning: package 'readxl' was built under R version 4.5.3

## Warning: package 'janitor' was built under R version 4.5.3

## Warning: package 'skimr' was built under R version 4.5.3

## Warning: package 'patchwork' was built under R version 4.5.3

## Warning: package 'scales' was built under R version 4.5.3

## Warning: package 'writexl' was built under R version 4.5.3

cat("✓ Librerías cargadas correctamente\n\n")

## ✓ Librerías cargadas correctamente

# ── Paleta visual consistente ──────────────────────────────────────────────────
C1 <- "#2C3E7A"   # azul oscuro (principal)
C2 <- "#E84040"   # rojo (alertas / secundario)
C3 <- "#F5A623"   # naranja (acento)
C4 <- "#27AE60"   # verde (positivo)
PALETA <- c(C1, C2, C3, C4, "#8E44AD", "#16A085", "#D35400", "#2980B9",
            "#C0392B", "#7F8C8D", "#1ABC9C", "#F39C12", "#884EA0", "#2E86C1")

# ── Tema ggplot ────────────────────────────────────────────────────────────────
tema <- theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", size = 13, color = C1),
    plot.subtitle = element_text(size = 10, color = "gray40"),
    axis.title    = element_text(size = 10),
    legend.position  = "bottom",
    panel.grid.minor = element_blank()
  )

# ── Funciones auxiliares ───────────────────────────────────────────────────────
moda_fn <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) return(NA_character_)
  ux <- unique(x); as.character(ux[which.max(tabulate(match(x, ux)))])
}

cv_fn <- function(x) round(sd(x, na.rm=TRUE) / mean(x, na.rm=TRUE) * 100, 2)

outliers_iqr <- function(x) {
  q1 <- quantile(x, .25, na.rm=TRUE); q3 <- quantile(x, .75, na.rm=TRUE)
  iqr <- q3 - q1
  x[!is.na(x) & (x < q1 - 1.5*iqr | x > q3 + 1.5*iqr)]
}

lims_iqr <- function(x) {
  q1 <- quantile(x, .25, na.rm=TRUE); q3 <- quantile(x, .75, na.rm=TRUE)
  iqr <- q3 - q1; list(inf = q1 - 1.5*iqr, sup = q3 + 1.5*iqr)
}

separador <- function(titulo) {
  cat("\n", strrep("═", 65), "\n", sep="")
  cat(" ", toupper(titulo), "\n", sep="")
  cat(strrep("═", 65), "\n\n", sep="")
}

sub_sep <- function(txt) {
  cat("\n  ── ", txt, " ──\n", sep="")
}

imprimir_tabla <- function(df, titulo = NULL) {
  if (!is.null(titulo)) cat("\n ", titulo, "\n")
  print(df, n = Inf)
}


# ═══════════════════════════════════════════════════════════════════════════════
# PARTE 1 ▸ ANÁLISIS GENERAL DE LA BASE DE DATOS
# ═══════════════════════════════════════════════════════════════════════════════

separador("PARTE 1 — ANÁLISIS GENERAL DE LA BASE DE DATOS")

## 
## ═════════════════════════════════════════════════════════════════
##  PARTE 1 — ANÁLISIS GENERAL DE LA BASE DE DATOS
## ═════════════════════════════════════════════════════════════════

# ── 1.1 Carga de datos ────────────────────────────────────────────────────────
RUTA <- "C:/Users/jcabia01/Downloads/Tabla anonimización Ok.xlsx"

cat("Cargando archivo...\n")

## Cargando archivo...

df_raw <- read_excel(RUTA, sheet = 1, guess_max = 10000, col_types = "text")
cat("✓ Archivo cargado\n\n")

## ✓ Archivo cargado

# ── 1.2 Eliminar Pago_Oportuno_Bin ANTES de limpiar nombres ──────────────────
# Se elimina por posición (columna 14) para evitar dependencia del nombre exacto
# que genera janitor según la versión instalada.
cat("Columnas originales en el Excel:\n")

## Columnas originales en el Excel:

print(names(df_raw))

##  [1] "Anon_Customer_ID"                   "Anon_Document_ID"                  
##  [3] "Terms_of_Payment"                   "Document_Type"                     
##  [5] "Document_Date"                      "Payment_date"                      
##  [7] "Net_due_date"                       "Arrears_after_net_due_date"        
##  [9] "Amount_in_local_currency"           "Reason_code"                       
## [11] "Clearing_date"                      "Year/month"                        
## [13] "Estado_Cartera = Abierta / Cerrada" "Bucket_Mora"

# Identificar y eliminar la columna Pago_Oportuno_Bin de forma robusta
col_eliminar <- grep("pago|Pago|oportuno|Oportuno|Bin|bin", names(df_raw),
                     value = TRUE, ignore.case = TRUE)
if (length(col_eliminar) > 0) {
  cat("\nEliminando columna(s):", paste(col_eliminar, collapse=", "), "\n")
  df_raw <- df_raw %>% select(-all_of(col_eliminar))
} else {
  # Fallback: eliminar por posición 14 si no se encontró por nombre
  if (ncol(df_raw) >= 14) {
    cat("\nEliminando columna en posición 14:", names(df_raw)[14], "\n")
    df_raw <- df_raw[, -14]
  }
}

## 
## Eliminando columna en posición 14: Bucket_Mora

cat("Columnas restantes:", ncol(df_raw), "\n\n")

## Columnas restantes: 13

# ── 1.3 Limpieza de nombres de columnas ───────────────────────────────────────
df <- df_raw %>% clean_names()
cat("Nombres limpios generados:\n")

## Nombres limpios generados:

print(names(df))

##  [1] "anon_customer_id"               "anon_document_id"              
##  [3] "terms_of_payment"               "document_type"                 
##  [5] "document_date"                  "payment_date"                  
##  [7] "net_due_date"                   "arrears_after_net_due_date"    
##  [9] "amount_in_local_currency"       "reason_code"                   
## [11] "clearing_date"                  "year_month"                    
## [13] "estado_cartera_abierta_cerrada"

cat("\n")

# ── 1.4 Conversión controlada de tipos ───────────────────────────────────────
df <- df %>%
  mutate(
    # Identificadores → texto
    anon_customer_id = as.character(anon_customer_id),
    anon_document_id = as.character(anon_document_id),
    reason_code      = as.character(reason_code),

    # Categóricas → texto
    terms_of_payment = as.character(terms_of_payment),
    document_type    = as.character(document_type),
    year_month       = as.character(year_month),
    estado_cartera   = as.character(estado_cartera_abierta_cerrada),

    # Numéricas
    arrears = suppressWarnings(as.numeric(arrears_after_net_due_date)),
    amount  = suppressWarnings(as.numeric(amount_in_local_currency)),

    # Fechas (número serial de Excel → Date)
    document_date = suppressWarnings(
      as.Date(as.numeric(document_date), origin = "1899-12-30")),
    payment_date  = suppressWarnings(
      as.Date(as.numeric(payment_date),  origin = "1899-12-30")),
    net_due_date  = suppressWarnings(
      as.Date(as.numeric(net_due_date),  origin = "1899-12-30")),
    clearing_date = suppressWarnings(
      as.Date(as.numeric(clearing_date), origin = "1899-12-30")),

    # Bucket_Mora: recalculada desde arrears (venía como fórmula sin calcular)
    bucket_mora = case_when(
      arrears <= 0   ~ "Al dia",
      arrears <= 30  ~ "1-30",
      arrears <= 60  ~ "31-60",
      arrears <= 90  ~ "61-90",
      arrears <= 180 ~ "91-180",
      arrears <= 360 ~ "181-360",
      TRUE            ~ ">360"
    ),
    bucket_mora = factor(bucket_mora,
      levels = c("Al dia","1-30","31-60","61-90","91-180","181-360",">360"),
      ordered = TRUE)
  ) %>%
  # Renombrar columnas a nombres limpios definitivos (mismo orden que original)
  select(
    anon_customer_id,
    anon_document_id,
    terms_of_payment,
    document_type,
    document_date,
    payment_date,
    net_due_date,
    arrears,
    amount,
    reason_code,
    clearing_date,
    year_month,
    estado_cartera,
    bucket_mora
  )

cat("✓ Tipos de variables asignados y Bucket_Mora recalculada\n")

## ✓ Tipos de variables asignados y Bucket_Mora recalculada

cat("  (Nota: en el Excel original Bucket_Mora contenía fórmulas IF sin\n")

##   (Nota: en el Excel original Bucket_Mora contenía fórmulas IF sin

cat("   calcular; se reconstruyó directamente desde Arrears)\n\n")

##    calcular; se reconstruyó directamente desde Arrears)

# ── 1.3 Dimensiones ───────────────────────────────────────────────────────────
sub_sep("DIMENSIONES")

## 
##   ── DIMENSIONES ──

cat("  Número de filas    :", formatC(nrow(df), big.mark=","), "\n")

##   Número de filas    : 341,027

cat("  Número de columnas :", ncol(df), "\n")

##   Número de columnas : 14

# ── 1.4 Nombres de variables ───────────────────────────────────────────────────
sub_sep("NOMBRES DE VARIABLES (en orden original)")

## 
##   ── NOMBRES DE VARIABLES (en orden original) ──

for (i in seq_along(names(df))) {
  cat(sprintf("  %2d. %s\n", i, names(df)[i]))
}

##    1. anon_customer_id
##    2. anon_document_id
##    3. terms_of_payment
##    4. document_type
##    5. document_date
##    6. payment_date
##    7. net_due_date
##    8. arrears
##    9. amount
##   10. reason_code
##   11. clearing_date
##   12. year_month
##   13. estado_cartera
##   14. bucket_mora

# ── 1.5 Tipos de dato en R ────────────────────────────────────────────────────
sub_sep("TIPO DE DATO EN R POR VARIABLE")

## 
##   ── TIPO DE DATO EN R POR VARIABLE ──

tipos_df <- tibble(
  `#`       = seq_along(names(df)),
  Variable  = names(df),
  Tipo_R    = map_chr(df, ~ class(.x)[1]),
  Categoria = case_when(
    names(df) %in% c("anon_customer_id","anon_document_id") ~ "Identificador",
    names(df) %in% c("document_date","payment_date",
                     "net_due_date","clearing_date")         ~ "Fecha",
    names(df) %in% c("arrears","amount")                    ~ "Numérica continua",
    names(df) == "bucket_mora"                               ~ "Categórica ordenada",
    TRUE                                                      ~ "Categórica nominal"
  )
)
imprimir_tabla(tipos_df)

## # A tibble: 14 × 4
##      `#` Variable         Tipo_R    Categoria          
##    <int> <chr>            <chr>     <chr>              
##  1     1 anon_customer_id character Identificador      
##  2     2 anon_document_id character Identificador      
##  3     3 terms_of_payment character Categórica nominal 
##  4     4 document_type    character Categórica nominal 
##  5     5 document_date    Date      Fecha              
##  6     6 payment_date     Date      Fecha              
##  7     7 net_due_date     Date      Fecha              
##  8     8 arrears          numeric   Numérica continua  
##  9     9 amount           numeric   Numérica continua  
## 10    10 reason_code      character Categórica nominal 
## 11    11 clearing_date    Date      Fecha              
## 12    12 year_month       character Categórica nominal 
## 13    13 estado_cartera   character Categórica nominal 
## 14    14 bucket_mora      ordered   Categórica ordenada

# ── 1.6 Vista inicial ─────────────────────────────────────────────────────────
sub_sep("PRIMERAS 6 FILAS (head)")

## 
##   ── PRIMERAS 6 FILAS (head) ──

print(head(df))

## # A tibble: 6 × 14
##   anon_customer_id anon_document_id terms_of_payment document_type document_date
##   <chr>            <chr>            <chr>            <chr>         <date>       
## 1 CUST_8           XXXXXX2081       <NA>             AB            2023-03-31   
## 2 CUST_273         XXXXXX2080       <NA>             AB            2023-03-31   
## 3 CUST_158         XXXXXX1017       Z000             DA            2026-03-26   
## 4 CUST_233         XXXXXX0970       Z914             DA            2026-03-25   
## 5 CUST_280         XXXXXX0076       Z913             DA            2026-01-21   
## 6 CUST_280         XXXXXX0077       Z913             DA            2026-01-21   
## # ℹ 9 more variables: payment_date <date>, net_due_date <date>, arrears <dbl>,
## #   amount <dbl>, reason_code <chr>, clearing_date <date>, year_month <chr>,
## #   estado_cartera <chr>, bucket_mora <ord>

sub_sep("ESTRUCTURA (str)")

## 
##   ── ESTRUCTURA (str) ──

str(df)

## tibble [341,027 × 14] (S3: tbl_df/tbl/data.frame)
##  $ anon_customer_id: chr [1:341027] "CUST_8" "CUST_273" "CUST_158" "CUST_233" ...
##  $ anon_document_id: chr [1:341027] "XXXXXX2081" "XXXXXX2080" "XXXXXX1017" "XXXXXX0970" ...
##  $ terms_of_payment: chr [1:341027] NA NA "Z000" "Z914" ...
##  $ document_type   : chr [1:341027] "AB" "AB" "DA" "DA" ...
##  $ document_date   : Date[1:341027], format: "2023-03-31" "2023-03-31" ...
##  $ payment_date    : Date[1:341027], format: "2019-10-25" "2017-11-24" ...
##  $ net_due_date    : Date[1:341027], format: "2019-10-25" "2017-11-24" ...
##  $ arrears         : num [1:341027] 2347 3047 3 -33 39 ...
##  $ amount          : num [1:341027] 78063189 212841936 -40372100 1470462 -37890 ...
##  $ reason_code     : chr [1:341027] "76" "76" "81" "81" ...
##  $ clearing_date   : Date[1:341027], format: NA NA ...
##  $ year_month      : chr [1:341027] "2023/03" "2023/03" "2026/03" "2026/03" ...
##  $ estado_cartera  : chr [1:341027] "Abierta" "Abierta" "Abierta" "Abierta" ...
##  $ bucket_mora     : Ord.factor w/ 7 levels "Al dia"<"1-30"<..: 7 7 2 1 3 3 3 2 2 3 ...

sub_sep("RESUMEN ESTADÍSTICO (summary)")

## 
##   ── RESUMEN ESTADÍSTICO (summary) ──

print(summary(df))

##  anon_customer_id   anon_document_id   terms_of_payment   document_type     
##  Length:341027      Length:341027      Length:341027      Length:341027     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  document_date         payment_date         net_due_date       
##  Min.   :2022-01-03   Min.   :2007-08-17   Min.   :2007-08-17  
##  1st Qu.:2022-12-15   1st Qu.:2022-12-23   1st Qu.:2022-12-29  
##  Median :2023-12-10   Median :2023-12-19   Median :2023-12-22  
##  Mean   :2024-01-12   Mean   :2024-02-04   Mean   :2024-02-08  
##  3rd Qu.:2025-02-08   3rd Qu.:2025-02-25   3rd Qu.:2025-02-28  
##  Max.   :2026-03-28   Max.   :4025-04-22   Max.   :4025-04-22  
##                                                                
##     arrears              amount           reason_code       
##  Min.   :-28308.00   Min.   :-2.080e+10   Length:341027     
##  1st Qu.:     0.00   1st Qu.:-4.592e+05   Class :character  
##  Median :     6.00   Median : 2.899e+05   Mode  :character  
##  Mean   :    15.94   Mean   : 5.062e+05                     
##  3rd Qu.:    21.00   3rd Qu.: 5.940e+06                     
##  Max.   :  5863.00   Max.   : 2.080e+10                     
##                                                             
##  clearing_date         year_month        estado_cartera      bucket_mora    
##  Min.   :2022-01-03   Length:341027      Length:341027      Al dia :111154  
##  1st Qu.:2022-12-29   Class :character   Class :character   1-30   :173943  
##  Median :2023-12-18   Mode  :character   Mode  :character   31-60  : 32091  
##  Mean   :2024-01-20                                         61-90  :  9629  
##  3rd Qu.:2025-01-24                                         91-180 :  9867  
##  Max.   :2026-04-01                                         181-360:  3432  
##  NA's   :11974                                              >360   :   911

# ── 1.7 Resumen rápido con skimr ──────────────────────────────────────────────
sub_sep("RESUMEN SKIMR")

## 
##   ── RESUMEN SKIMR ──

print(skim(df))

## ── Data Summary ────────────────────────
##                            Values
## Name                       df    
## Number of rows             341027
## Number of columns          14    
## _______________________          
## Column type frequency:           
##   character                7     
##   Date                     4     
##   factor                   1     
##   numeric                  2     
## ________________________         
## Group variables            None  
## 
## ── Variable type: character ────────────────────────────────────────────────────
##   skim_variable    n_missing complete_rate min max empty n_unique whitespace
## 1 anon_customer_id         0         1       6   8     0      319          0
## 2 anon_document_id         0         1      10  10     0    10000          0
## 3 terms_of_payment     58231         0.829   4   4     0       36          0
## 4 document_type            0         1       2   2     0       12          0
## 5 reason_code         305226         0.105   2   3     0       18          0
## 6 year_month               0         1       7   7     0       51          0
## 7 estado_cartera           0         1       7   7     0        2          0
## 
## ── Variable type: Date ─────────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min        max        median    
## 1 document_date         0         1     2022-01-03 2026-03-28 2023-12-10
## 2 payment_date          0         1     2007-08-17 4025-04-22 2023-12-19
## 3 net_due_date          0         1     2007-08-17 4025-04-22 2023-12-22
## 4 clearing_date     11974         0.965 2022-01-03 2026-04-01 2023-12-18
##   n_unique
## 1     1416
## 2     1640
## 3     1640
## 4     1172
## 
## ── Variable type: factor ───────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate ordered n_unique
## 1 bucket_mora           0             1 TRUE           7
##   top_counts                                     
## 1 1-3: 173943, Al : 111154, 31-: 32091, 91-: 9867
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate     mean         sd            p0
## 1 arrears               0             1     15.9       135.       -28308 
## 2 amount                0             1 506197.  148316687. -20800206706.
##        p25    p50      p75         p100 hist 
## 1       0       6      21         5863  ▁▁▁▁▇
## 2 -459152. 289884 5939778. 20800206706. ▁▁▇▁▁

# ── 1.8 Valores faltantes ─────────────────────────────────────────────────────
sub_sep("VALORES FALTANTES POR VARIABLE")

## 
##   ── VALORES FALTANTES POR VARIABLE ──

faltantes <- df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to="Variable", values_to="N_Faltantes") %>%
  mutate(
    Total       = nrow(df),
    Pct         = round(N_Faltantes / Total * 100, 2),
    Semaforo    = case_when(
      Pct == 0   ~ "✓ Sin faltantes",
      Pct <= 5   ~ "▲ Bajo (<5%)",
      Pct <= 20  ~ "● Moderado (5-20%)",
      TRUE        ~ "✖ Alto (>20%)"
    )
  ) %>%
  arrange(desc(Pct))

imprimir_tabla(faltantes)

## # A tibble: 14 × 5
##    Variable         N_Faltantes  Total   Pct Semaforo          
##    <chr>                  <int>  <int> <dbl> <chr>             
##  1 reason_code           305226 341027 89.5  ✖ Alto (>20%)     
##  2 terms_of_payment       58231 341027 17.1  ● Moderado (5-20%)
##  3 clearing_date          11974 341027  3.51 ▲ Bajo (<5%)      
##  4 anon_customer_id           0 341027  0    ✓ Sin faltantes   
##  5 anon_document_id           0 341027  0    ✓ Sin faltantes   
##  6 document_type              0 341027  0    ✓ Sin faltantes   
##  7 document_date              0 341027  0    ✓ Sin faltantes   
##  8 payment_date               0 341027  0    ✓ Sin faltantes   
##  9 net_due_date               0 341027  0    ✓ Sin faltantes   
## 10 arrears                    0 341027  0    ✓ Sin faltantes   
## 11 amount                     0 341027  0    ✓ Sin faltantes   
## 12 year_month                 0 341027  0    ✓ Sin faltantes   
## 13 estado_cartera             0 341027  0    ✓ Sin faltantes   
## 14 bucket_mora                0 341027  0    ✓ Sin faltantes

cat("\n  Total celdas faltantes :", formatC(sum(faltantes$N_Faltantes), big.mark=","), "\n")

## 
##   Total celdas faltantes : 375,431

cat("  Total celdas en base   :", formatC(nrow(df) * ncol(df), big.mark=","), "\n")

##   Total celdas en base   : 4,774,378

cat("  % faltantes global     :",
    round(sum(faltantes$N_Faltantes) / (nrow(df)*ncol(df)) * 100, 2), "%\n")

##   % faltantes global     : 7.86 %

# Gráfico de faltantes
p_faltantes <- faltantes %>% filter(Pct > 0) %>%
  ggplot(aes(x = reorder(Variable, Pct), y = Pct)) +
  geom_col(fill = C2, alpha = 0.85) +
  geom_text(aes(label = paste0(Pct, "%")), hjust = -0.1, size = 3.5) +
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(title = "Porcentaje de Valores Faltantes por Variable",
       x = NULL, y = "% Faltantes") +
  tema
print(p_faltantes)

# ── 1.9 Duplicados ────────────────────────────────────────────────────────────
sub_sep("REGISTROS DUPLICADOS")

## 
##   ── REGISTROS DUPLICADOS ──

n_dup_filas <- sum(duplicated(df))
n_dup_docs  <- df %>% count(anon_document_id) %>% filter(n > 1) %>% nrow()
cat("  Filas completamente duplicadas  :", n_dup_filas, "\n")

##   Filas completamente duplicadas  : 3405

cat("  Document IDs con más de 1 fila  :", n_dup_docs, "\n")

##   Document IDs con más de 1 fila  : 10000

top_docs_dup <- df %>% count(anon_document_id, sort=TRUE) %>%
  filter(n > 1) %>% slice_head(n=10)
if (nrow(top_docs_dup) > 0) {
  cat("\n  Top 10 document IDs más repetidos:\n")
  print(top_docs_dup)
}

## 
##   Top 10 document IDs más repetidos:
## # A tibble: 10 × 2
##    anon_document_id     n
##    <chr>            <int>
##  1 XXXXXX2145         236
##  2 XXXXXX5881         223
##  3 XXXXXX3880         222
##  4 XXXXXX1107         220
##  5 XXXXXX1544         218
##  6 XXXXXX3807         217
##  7 XXXXXX0617         212
##  8 XXXXXX2836         212
##  9 XXXXXX5331         208
## 10 XXXXXX1914         197

# ── 1.10 Estadísticas descriptivas globales para numéricas ───────────────────
sub_sep("ESTADÍSTICAS DESCRIPTIVAS GLOBALES — VARIABLES NUMÉRICAS")

## 
##   ── ESTADÍSTICAS DESCRIPTIVAS GLOBALES — VARIABLES NUMÉRICAS ──

tab_num_global <- df %>%
  select(arrears, amount) %>%
  pivot_longer(everything(), names_to="Variable", values_to="Valor") %>%
  group_by(Variable) %>%
  summarise(
    N          = sum(!is.na(Valor)),
    Faltantes  = sum(is.na(Valor)),
    Media      = round(mean(Valor, na.rm=TRUE), 2),
    Mediana    = round(median(Valor, na.rm=TRUE), 2),
    Moda       = moda_fn(Valor),
    Min        = round(min(Valor, na.rm=TRUE), 2),
    Max        = round(max(Valor, na.rm=TRUE), 2),
    Rango      = round(max(Valor, na.rm=TRUE)-min(Valor, na.rm=TRUE), 2),
    DesvStd    = round(sd(Valor, na.rm=TRUE), 2),
    CV_pct     = cv_fn(Valor),
    Asimetria  = round(skewness(Valor, na.rm=TRUE), 3),
    Curtosis   = round(kurtosis(Valor, na.rm=TRUE), 3),
    N_Outliers = length(outliers_iqr(Valor)),
    .groups = "drop"
  )
imprimir_tabla(tab_num_global)

## # A tibble: 2 × 14
##   Variable      N Faltantes    Media Mediana Moda            Min     Max   Rango
##   <chr>     <int>     <int>    <dbl>   <dbl> <chr>         <dbl>   <dbl>   <dbl>
## 1 amount   341027         0 506197.   289884 0     -20800206706. 2.08e10 4.16e10
## 2 arrears  341027         0     15.9       6 0           -28308  5.86e 3 3.42e 4
## # ℹ 5 more variables: DesvStd <dbl>, CV_pct <dbl>, Asimetria <dbl>,
## #   Curtosis <dbl>, N_Outliers <int>

# ── 1.11 Tablas de frecuencias globales para categóricas ─────────────────────
sub_sep("TABLAS DE FRECUENCIAS GLOBALES — VARIABLES CATEGÓRICAS")

## 
##   ── TABLAS DE FRECUENCIAS GLOBALES — VARIABLES CATEGÓRICAS ──

vars_cat <- c("terms_of_payment","document_type","estado_cartera",
              "year_month","bucket_mora","reason_code")

for (v in vars_cat) {
  cat("\n  [", v, "]\n", sep="")
  tab <- df %>%
    count(.data[[v]], name="Freq_Abs") %>%
    rename(Categoria = 1) %>%
    mutate(Categoria  = as.character(Categoria),
           Freq_Rel   = round(Freq_Abs/sum(Freq_Abs)*100, 2),
           Freq_Acum  = cumsum(Freq_Rel)) %>%
    arrange(desc(Freq_Abs))
  print(head(tab, 15), n=15)
}

## 
##   [terms_of_payment]
## # A tibble: 15 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 Z914        122203    35.8      82.9 
##  2 Z522         62352    18.3      30.0 
##  3 <NA>         58231    17.1     100.0 
##  4 Z526         29496     8.65     39.2 
##  5 Z521         24874     7.29     11.7 
##  6 Z913         19918     5.84     47.1 
##  7 Z000         13488     3.96      4.08
##  8 Z540          4845     1.42     40.6 
##  9 Z525          1947     0.57     30.6 
## 10 Z672          1466     0.43     41.1 
## 11 Z691           422     0.12     41.2 
## 12 Z040           356     0.1       4.18
## 13 Z505           329     0.1       4.4 
## 14 Z090           272     0.08      4.26
## 15 B045           215     0.06      0.08
## 
##   [document_type]
## # A tibble: 12 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 RV          141669    41.5      95.2 
##  2 DZ           85530    25.1      39.1 
##  3 NC           34420    10.1      49.2 
##  4 DA           28093     8.24     14.0 
##  5 AB           16506     4.84      4.84
##  6 ZV           16415     4.81    100.0 
##  7 RU           15260     4.47     53.6 
##  8 CC            3039     0.89      5.73
##  9 DR              62     0.02     14   
## 10 DG              18     0.01     14.0 
## 11 SA              11     0        95.2 
## 12 ND               4     0        49.2 
## 
##   [estado_cartera]
## # A tibble: 2 × 4
##   Categoria Freq_Abs Freq_Rel Freq_Acum
##   <chr>        <int>    <dbl>     <dbl>
## 1 Cerrada     334952    98.2     100   
## 2 Abierta       6075     1.78      1.78
## 
##   [year_month]
## # A tibble: 15 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 2026/03      10750     3.15    100.0 
##  2 2023/09       8950     2.62     45.0 
##  3 2022/06       8793     2.58     13.1 
##  4 2022/09       8688     2.55     19.8 
##  5 2023/12       8671     2.54     51.5 
##  6 2022/03       8231     2.41      6.23
##  7 2023/06       8215     2.41     38.4 
##  8 2022/12       8096     2.37     26.1 
##  9 2022/05       7945     2.33     10.5 
## 10 2023/03       7900     2.32     32.1 
## 11 2025/09       7330     2.15     88.3 
## 12 2024/09       7285     2.14     68.1 
## 13 2024/06       7274     2.13     62.9 
## 14 2024/03       7174     2.1      57.2 
## 15 2022/08       7134     2.09     17.2 
## 
##   [bucket_mora]
## # A tibble: 7 × 4
##   Categoria Freq_Abs Freq_Rel Freq_Acum
##   <chr>        <int>    <dbl>     <dbl>
## 1 1-30        173943    51.0       83.6
## 2 Al dia      111154    32.6       32.6
## 3 31-60        32091     9.41      93.0
## 4 91-180        9867     2.89      98.7
## 5 61-90         9629     2.82      95.8
## 6 181-360       3432     1.01      99.7
## 7 >360           911     0.27     100  
## 
##   [reason_code]
## # A tibble: 15 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 <NA>        305226    89.5     100.  
##  2 50           11999     3.52      5.08
##  3 81            9942     2.92     10.5 
##  4 62            7207     2.11      7.34
##  5 21            2171     0.64      0.85
##  6 403           2037     0.6       1.49
##  7 12             700     0.21      0.21
##  8 59             495     0.15      5.23
##  9 76             343     0.1       7.58
## 10 44             247     0.07      1.56
## 11 72             208     0.06      7.48
## 12 71             198     0.06      7.42
## 13 22              72     0.02      0.87
## 14 70              59     0.02      7.36
## 15 90              58     0.02     10.5

# ═══════════════════════════════════════════════════════════════════════════════
# PARTE 2 ▸ ANÁLISIS INDIVIDUAL POR VARIABLE (orden original de la base)
# ═══════════════════════════════════════════════════════════════════════════════

separador("PARTE 2 — ANÁLISIS INDIVIDUAL POR VARIABLE")

## 
## ═════════════════════════════════════════════════════════════════
##  PARTE 2 — ANÁLISIS INDIVIDUAL POR VARIABLE
## ═════════════════════════════════════════════════════════════════

# ─────────────────────────────────────────────────────────────────────────────
# Función genérica: análisis de variable IDENTIFICADOR / ID
# ─────────────────────────────────────────────────────────────────────────────
analizar_id <- function(df, var, numero) {
  separador(paste0("VARIABLE ", numero, ": ", var, "  [IDENTIFICADOR]"))
  vec <- df[[var]]

  cat("  Tipo en R        :", class(vec)[1], "\n")
  cat("  Tipo analítico   : Identificador / Código único\n\n")

  n_tot  <- length(vec)
  n_na   <- sum(is.na(vec))
  n_uni  <- n_distinct(vec, na.rm=TRUE)
  n_dup  <- n_tot - n_uni
  pct_u  <- round(n_uni/n_tot*100, 2)

  cat("  Total registros  :", formatC(n_tot, big.mark=","), "\n")
  cat("  Valores únicos   :", formatC(n_uni, big.mark=","), "\n")
  cat("  Duplicados       :", formatC(n_dup, big.mark=","), "\n")
  cat("  Faltantes        :", n_na, "\n")
  cat("  % Unicidad       :", pct_u, "%\n")
  cat("  Uso sugerido     :",
      ifelse(pct_u > 95, "Clave primaria / Join key",
             "Agrupador / Segmentador (no es clave única)"), "\n")

  # Frecuencia de repetición
  freq_rep <- df %>% count(.data[[var]], name="n") %>%
    count(n, name="cantidad_de_IDs") %>%
    rename(veces_repetido = n) %>% arrange(veces_repetido)
  cat("\n  Distribución de frecuencia de repetición:\n")
  print(head(freq_rep, 10))

  # Top 15 más frecuentes
  top15 <- df %>% count(.data[[var]], name="Registros", sort=TRUE) %>%
    slice_head(n=15) %>% rename(ID=1)
  cat("\n  Top 15 valores más frecuentes:\n")
  print(top15)

  # Gráfico
  p <- top15 %>%
    ggplot(aes(x=reorder(ID, Registros), y=Registros)) +
    geom_col(fill=C1, alpha=0.85) +
    geom_text(aes(label=comma(Registros)), hjust=-0.1, size=3) +
    coord_flip() +
    scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.15))) +
    labs(title = paste("Top 15 —", var),
         subtitle = paste("Total valores únicos:", formatC(n_uni, big.mark=",")),
         x=NULL, y="Registros") +
    tema
  print(p)
}

# ─────────────────────────────────────────────────────────────────────────────
# Función genérica: análisis de variable CATEGÓRICA
# ─────────────────────────────────────────────────────────────────────────────
analizar_categorica <- function(df, var, numero, top_n=15) {
  separador(paste0("VARIABLE ", numero, ": ", var, "  [CATEGÓRICA]"))
  vec <- as.character(df[[var]])

  cat("  Tipo en R        :", class(df[[var]])[1], "\n")
  cat("  Tipo analítico   : Cualitativa nominal/ordinal\n\n")

  n_tot  <- length(vec)
  n_na   <- sum(is.na(vec) | vec == "")
  n_cat  <- n_distinct(vec[!is.na(vec) & vec != ""])
  tab <- sort(table(vec[!is.na(vec) & vec != ""]), decreasing=TRUE)

  cat("  Registros totales  :", formatC(n_tot, big.mark=","), "\n")
  cat("  Valores faltantes  :", n_na, "(", round(n_na/n_tot*100,2), "%)\n")
  cat("  Categorías únicas  :", n_cat, "\n")
  cat("  Categoría más frec.:", names(tab)[1], "→", tab[1], "registros\n")
  cat("  Categoría menos fr.:", names(tab)[length(tab)],
      "→", tab[length(tab)], "registros\n")

  # Tabla de frecuencias
  freq_tabla <- tibble(
    Categoria  = names(tab),
    Freq_Abs   = as.integer(tab),
    Freq_Rel   = round(as.integer(tab)/sum(as.integer(tab))*100, 2),
    Freq_Acum  = cumsum(round(as.integer(tab)/sum(as.integer(tab))*100, 2))
  )
  cat("\n  Tabla de frecuencias (top", min(top_n, nrow(freq_tabla)), "):\n")
  print(head(freq_tabla, top_n))

  cat("\n  Categorías raras (<1% del total):",
      sum(freq_tabla$Freq_Rel < 1), "\n")

  # Gráfico de barras
  top_df <- head(freq_tabla, top_n)
  p_bar <- ggplot(top_df, aes(x=reorder(Categoria, Freq_Abs), y=Freq_Abs)) +
    geom_col(fill=C1, alpha=0.85) +
    geom_text(aes(label=paste0(Freq_Rel, "%")), hjust=-0.1, size=3.2) +
    coord_flip() +
    scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.18))) +
    labs(title    = paste("Frecuencia:", var),
         subtitle = paste("Top", min(top_n,n_cat), "de", n_cat, "categorías"),
         x=NULL, y="Frecuencia absoluta") +
    tema

  # Gráfico circular (solo si ≤ 10 categorías)
  if (n_cat <= 10) {
    p_pie <- top_df %>%
      mutate(Categoria = factor(Categoria, levels=rev(Categoria))) %>%
      ggplot(aes(x="", y=Freq_Rel, fill=Categoria)) +
      geom_col(width=1, color="white") +
      coord_polar("y") +
      scale_fill_manual(values=PALETA[seq_len(min(n_cat,14))]) +
      geom_text(aes(label=paste0(Freq_Rel,"%")),
                position=position_stack(vjust=0.5), size=3, color="white") +
      labs(title=paste("Distribución:", var), x=NULL, y=NULL) +
      theme_void(base_size=11) +
      theme(plot.title=element_text(face="bold",color=C1,size=12),
            legend.position="right")
    print(p_bar / p_pie)
  } else {
    print(p_bar)
  }
}

# ─────────────────────────────────────────────────────────────────────────────
# Función genérica: análisis de variable FECHA
# ─────────────────────────────────────────────────────────────────────────────
analizar_fecha <- function(df, var, numero) {
  separador(paste0("VARIABLE ", numero, ": ", var, "  [FECHA]"))
  vec <- df[[var]]

  cat("  Tipo en R        :", class(vec)[1], "\n")
  cat("  Tipo analítico   : Variable de fecha/tiempo\n\n")

  n_na   <- sum(is.na(vec))
  n_val  <- sum(!is.na(vec))
  f_min  <- min(vec, na.rm=TRUE)
  f_max  <- max(vec, na.rm=TRUE)
  rango_dias <- as.numeric(f_max - f_min)

  cat("  Registros válidos  :", formatC(n_val, big.mark=","), "\n")
  cat("  Faltantes          :", formatC(n_na,  big.mark=","),
      "(", round(n_na/length(vec)*100,2), "%)\n")
  cat("  Fecha mínima       :", as.character(f_min), "\n")
  cat("  Fecha máxima       :", as.character(f_max), "\n")
  cat("  Rango en días      :", formatC(rango_dias, big.mark=","), "\n")
  cat("  Rango en años      :", round(rango_dias/365, 1), "\n")

  # Fechas fuera de rango razonable
  n_muy_ant <- sum(vec < as.Date("2000-01-01"), na.rm=TRUE)
  n_futuras <- sum(vec > Sys.Date() + 730, na.rm=TRUE)
  cat("  Fechas antes 2000  :", n_muy_ant, "\n")
  cat("  Fechas > hoy+2años :", n_futuras, "\n")

  # Registros por año
  df_anio <- df %>% filter(!is.na(.data[[var]])) %>%
    mutate(Anio = year(.data[[var]])) %>% count(Anio)
  cat("\n  Registros por año:\n"); print(df_anio, n=Inf)

  # Registros por mes del año (agregado)
  df_mes <- df %>% filter(!is.na(.data[[var]])) %>%
    mutate(Mes = month(.data[[var]], label=TRUE, abbr=FALSE)) %>%
    count(Mes)
  cat("\n  Registros por mes (agregado de todos los años):\n")
  print(df_mes, n=Inf)

  # Gráfico: registros por año
  p_anio <- ggplot(df_anio, aes(x=factor(Anio), y=n)) +
    geom_col(fill=C1, alpha=0.85) +
    geom_text(aes(label=comma(n)), vjust=-0.3, size=3) +
    scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.12))) +
    labs(title=paste("Registros por año:", var), x="Año", y="Registros") +
    tema + theme(axis.text.x=element_text(angle=45,hjust=1))

  # Gráfico: tendencia mensual (últimos 5 años)
  anio_corte <- year(Sys.Date()) - 5
  df_mensual <- df %>%
    filter(!is.na(.data[[var]]), year(.data[[var]]) >= anio_corte) %>%
    mutate(Mes = floor_date(.data[[var]], "month")) %>%
    count(Mes)

  p_mensual <- ggplot(df_mensual, aes(x=Mes, y=n)) +
    geom_line(color=C1, linewidth=1) +
    geom_point(color=C2, size=1.5) +
    scale_x_date(date_breaks="3 months", date_labels="%b %Y") +
    scale_y_continuous(labels=comma) +
    labs(title    = paste("Tendencia mensual:", var),
         subtitle = paste("Últimos 5 años (desde", anio_corte, ")"),
         x=NULL, y="Registros") +
    tema + theme(axis.text.x=element_text(angle=45,hjust=1))

  print(p_anio / p_mensual)
}

# ─────────────────────────────────────────────────────────────────────────────
# Función genérica: análisis de variable NUMÉRICA
# ─────────────────────────────────────────────────────────────────────────────
analizar_numerica <- function(df, var, numero, nombre_largo=NULL) {
  etiq <- if (!is.null(nombre_largo)) nombre_largo else var
  separador(paste0("VARIABLE ", numero, ": ", var, "  [NUMÉRICA]"))
  vec <- df[[var]]

  cat("  Tipo en R        :", class(vec)[1], "\n")
  cat("  Tipo analítico   : Cuantitativa continua\n\n")

  # ── Estadísticas descriptivas ──────────────────────────────────────────────
  n_val  <- sum(!is.na(vec)); n_na <- sum(is.na(vec))
  media  <- mean(vec, na.rm=TRUE); med   <- median(vec, na.rm=TRUE)
  moda_v <- moda_fn(vec)
  mn     <- min(vec, na.rm=TRUE);  mx    <- max(vec, na.rm=TRUE)
  rng    <- mx - mn
  q1     <- quantile(vec,.25,na.rm=TRUE); q3 <- quantile(vec,.75,na.rm=TRUE)
  iqr_v  <- q3 - q1
  desv   <- sd(vec, na.rm=TRUE); varz <- var(vec, na.rm=TRUE)
  cv     <- if (media != 0) cv_fn(vec) else NA
  asim   <- skewness(vec, na.rm=TRUE); kurt <- kurtosis(vec, na.rm=TRUE)
  p5     <- quantile(vec,.05,na.rm=TRUE); p10 <- quantile(vec,.10,na.rm=TRUE)
  p90    <- quantile(vec,.90,na.rm=TRUE); p95 <- quantile(vec,.95,na.rm=TRUE)
  p99    <- quantile(vec,.99,na.rm=TRUE)
  out_v  <- outliers_iqr(vec)

  cat("  ─ CONTEO ─────────────────────────────────────────────\n")
  cat("  Registros válidos  :", formatC(n_val, big.mark=","), "\n")
  cat("  Valores faltantes  :", n_na, "(", round(n_na/length(vec)*100,2), "%)\n")

  cat("\n  ─ TENDENCIA CENTRAL ──────────────────────────────────\n")
  cat("  Media              :", formatC(round(media,2), big.mark=","), "\n")
  cat("  Mediana            :", formatC(round(med,2),   big.mark=","), "\n")
  cat("  Moda               :", moda_v, "\n")

  cat("\n  ─ DISPERSIÓN ─────────────────────────────────────────\n")
  cat("  Mínimo             :", formatC(round(mn,2), big.mark=","), "\n")
  cat("  Máximo             :", formatC(round(mx,2), big.mark=","), "\n")
  cat("  Rango              :", formatC(round(rng,2), big.mark=","), "\n")
  cat("  Varianza           :", formatC(round(varz,2), big.mark=","), "\n")
  cat("  Desv. estándar     :", formatC(round(desv,2), big.mark=","), "\n")
  cat("  Coef. variación    :", if(!is.na(cv)) paste0(cv, "%") else "N/A", "\n")
  cat("  IQR (Q3-Q1)        :", formatC(round(iqr_v,2), big.mark=","), "\n")

  cat("\n  ─ CUARTILES Y PERCENTILES ────────────────────────────\n")
  cat("  P5                 :", formatC(round(p5,2),  big.mark=","), "\n")
  cat("  P10                :", formatC(round(p10,2), big.mark=","), "\n")
  cat("  Q1 (P25)           :", formatC(round(q1,2),  big.mark=","), "\n")
  cat("  Mediana (P50)      :", formatC(round(med,2), big.mark=","), "\n")
  cat("  Q3 (P75)           :", formatC(round(q3,2),  big.mark=","), "\n")
  cat("  P90                :", formatC(round(p90,2), big.mark=","), "\n")
  cat("  P95                :", formatC(round(p95,2), big.mark=","), "\n")
  cat("  P99                :", formatC(round(p99,2), big.mark=","), "\n")

  cat("\n  ─ FORMA DE LA DISTRIBUCIÓN ───────────────────────────\n")
  cat("  Asimetría          :", round(asim,3), "\n")
  cat("  Curtosis           :", round(kurt,3), "\n")
  cat("  Interpretación     :", case_when(
    abs(asim) < 0.5 ~ "Distribución aproximadamente simétrica",
    asim >  0.5     ~ "Sesgo positivo (cola derecha): valores altos extremos",
    asim < -0.5     ~ "Sesgo negativo (cola izquierda): valores bajos extremos"
  ), "\n")
  cat("  Curtosis interp.   :", case_when(
    kurt > 3 ~ "Leptocúrtica: colas pesadas, más outliers que normal",
    kurt < 3 ~ "Platicúrtica: colas ligeras, menos outliers",
    TRUE      ~ "Mesocúrtica: similar a distribución normal"
  ), "\n")

  cat("\n  ─ OUTLIERS IQR ───────────────────────────────────────\n")
  lims <- lims_iqr(vec)
  cat("  Límite inferior IQR:", formatC(round(lims$inf,2), big.mark=","), "\n")
  cat("  Límite superior IQR:", formatC(round(lims$sup,2), big.mark=","), "\n")
  cat("  N° outliers        :", formatC(length(out_v), big.mark=","), "\n")
  cat("  % de outliers      :", round(length(out_v)/n_val*100,2), "%\n")
  if (length(out_v) > 0) {
    cat("  Resumen outliers   : Min=", round(min(out_v),2),
        "| Max=", round(max(out_v),2), "\n")
  }

  # ── Gráficos ──────────────────────────────────────────────────────────────
  vec_plot <- vec[!is.na(vec) & vec >= lims$inf & vec <= lims$sup]

  # Histograma
  p_hist <- ggplot(data.frame(x=vec_plot), aes(x=x)) +
    geom_histogram(fill=C1, color="white", bins=50, alpha=0.85) +
    geom_vline(xintercept=media, color=C2, linetype="dashed", linewidth=1) +
    geom_vline(xintercept=med,   color=C3, linetype="dashed", linewidth=1) +
    scale_x_continuous(labels=comma) +
    scale_y_continuous(labels=comma) +
    labs(title    = paste("Histograma:", etiq),
         subtitle = "Rojo = Media  |  Naranja = Mediana  |  Sin outliers IQR",
         x=etiq, y="Frecuencia") +
    tema

  # Densidad
  p_dens <- ggplot(data.frame(x=vec_plot), aes(x=x)) +
    geom_density(fill=C1, color=C1, alpha=0.4) +
    geom_vline(xintercept=media, color=C2, linetype="dashed") +
    geom_vline(xintercept=med,   color=C3, linetype="dashed") +
    scale_x_continuous(labels=comma) +
    labs(title="Densidad", x=etiq, y="Densidad") +
    tema

  # Boxplot completo (con outliers)
  p_box <- ggplot(data.frame(x=vec), aes(y=x)) +
    geom_boxplot(fill=C1, alpha=0.65,
                 outlier.color=C2, outlier.alpha=0.15, outlier.size=0.8) +
    scale_y_continuous(labels=comma) +
    labs(title    = paste("Boxplot:", etiq),
         subtitle = paste("Outliers IQR:", formatC(length(out_v), big.mark=",")),
         y=etiq) +
    tema

  # Gráfico 4: distribución por rangos (deciles o percentiles únicos)
  # Se usa unique() para evitar el error "breaks no son únicos" cuando
  # hay muchos valores repetidos (ej: miles de ceros en arrears).
  breaks_dec <- unique(quantile(vec, probs = seq(0, 1, .1), na.rm = TRUE))

  if (length(breaks_dec) >= 3) {
    # Hay suficientes breaks únicos → usar cut()
    df_dec <- data.frame(x = vec) %>%
      filter(!is.na(x)) %>%
      mutate(decil = cut(x, breaks = breaks_dec, include.lowest = TRUE)) %>%
      count(decil) %>%
      filter(!is.na(decil))

    p_dec <- ggplot(df_dec, aes(x = decil, y = n)) +
      geom_col(fill = C3, alpha = 0.85, color = "white") +
      scale_y_continuous(labels = comma) +
      labs(title = "Distribución por rangos (deciles únicos)",
           x = "Rango", y = "Registros") +
      tema +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))

  } else {
    # Pocos valores únicos (variable muy discreta) → barras simples por valor
    df_dec <- data.frame(x = vec) %>%
      filter(!is.na(x)) %>%
      count(x) %>%
      slice_head(n = 20)   # máximo 20 valores distintos

    p_dec <- ggplot(df_dec, aes(x = factor(x), y = n)) +
      geom_col(fill = C3, alpha = 0.85, color = "white") +
      scale_y_continuous(labels = comma) +
      labs(title = "Distribución por valor (variable discreta)",
           x = "Valor", y = "Registros") +
      tema +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
  }

  # Panel 2×2
  panel <- (p_hist | p_dens) / (p_box | p_dec)
  panel <- panel + plot_annotation(
    title = paste("Análisis gráfico:", etiq),
    theme = theme(plot.title=element_text(face="bold",size=14,color=C1))
  )
  print(panel)
}


# ═══════════════════════════════════════════════════════════════════════════════
# ANÁLISIS INDIVIDUAL — VARIABLE POR VARIABLE EN ORDEN ORIGINAL
# ═══════════════════════════════════════════════════════════════════════════════

# ─────────────────────────────────────────────────────────────────────────────
# VAR 1: Anon_Customer_ID
# ─────────────────────────────────────────────────────────────────────────────
analizar_id(df, "anon_customer_id", 1)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 1: ANON_CUSTOMER_ID  [IDENTIFICADOR]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : character 
##   Tipo analítico   : Identificador / Código único
## 
##   Total registros  : 341,027 
##   Valores únicos   : 319 
##   Duplicados       : 340,708 
##   Faltantes        : 0 
##   % Unicidad       : 0.09 %
##   Uso sugerido     : Agrupador / Segmentador (no es clave única) 
## 
##   Distribución de frecuencia de repetición:
## # A tibble: 10 × 2
##    veces_repetido cantidad_de_IDs
##             <int>           <int>
##  1              1               7
##  2              2              14
##  3              3               3
##  4              4               5
##  5              5               5
##  6              6               5
##  7              7               3
##  8              8               7
##  9              9               3
## 10             10               2
## 
##   Top 15 valores más frecuentes:
## # A tibble: 15 × 2
##    ID       Registros
##    <chr>        <int>
##  1 CUST_317     65778
##  2 CUST_95      24522
##  3 CUST_216     19360
##  4 CUST_76       9911
##  5 CUST_205      8936
##  6 CUST_70       7709
##  7 CUST_101      6612
##  8 CUST_263      6608
##  9 CUST_141      5638
## 10 CUST_270      5623
## 11 CUST_307      5400
## 12 CUST_17       5016
## 13 CUST_118      4704
## 14 CUST_233      4256
## 15 CUST_58       4012

cat("\n  ─ ANÁLISIS ADICIONAL: CUSTOMER ──────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL: CUSTOMER ──────────────────────────

cat("  Clientes únicos:", n_distinct(df$anon_customer_id), "\n")

##   Clientes únicos: 319

# Top clientes por monto
top_cust_monto <- df %>% filter(!is.na(amount)) %>%
  group_by(anon_customer_id) %>%
  summarise(Monto_Total=sum(amount,na.rm=TRUE), N=n(), .groups="drop") %>%
  arrange(desc(Monto_Total)) %>% slice_head(n=15) %>%
  mutate(Pct_Monto=round(Monto_Total/sum(Monto_Total)*100,2))
cat("\n  Top 15 clientes por monto total:\n")

## 
##   Top 15 clientes por monto total:

print(top_cust_monto)

## # A tibble: 15 × 4
##    anon_customer_id  Monto_Total     N Pct_Monto
##    <chr>                   <dbl> <int>     <dbl>
##  1 CUST_317         20676237893. 65778     16.9 
##  2 CUST_218         19482750020.  1065     15.9 
##  3 CUST_215         17363198631.  2093     14.2 
##  4 CUST_263         10633703698.  6608      8.68
##  5 CUST_281          9744251100   3214      7.95
##  6 CUST_216          7224419079. 19360      5.89
##  7 CUST_17           5564459842.  5016      4.54
##  8 CUST_95           5492939964. 24522      4.48
##  9 CUST_141          4723907294.  5638      3.85
## 10 CUST_133          4568406870.   335      3.73
## 11 CUST_165          4457047781.  2561      3.64
## 12 CUST_76           3671116620.  9911      3   
## 13 CUST_258          3027124939.  1327      2.47
## 14 CUST_203          3026880498.  2561      2.47
## 15 CUST_131          2897084242.  1500      2.36

# Concentración (curva tipo Lorenz)
df_conc <- df %>% filter(!is.na(amount)) %>%
  group_by(anon_customer_id) %>%
  summarise(mt=sum(amount,na.rm=TRUE),.groups="drop") %>%
  arrange(desc(mt)) %>%
  mutate(x=row_number()/n(), y=cumsum(mt)/sum(mt))

p_lorenz <- ggplot(df_conc, aes(x=x,y=y)) +
  geom_line(color=C1, linewidth=1.2) +
  geom_abline(slope=1, intercept=0, linetype="dashed", color="gray60") +
  geom_vline(xintercept=.20, linetype="dotted", color=C2) +
  annotate("text",x=.22,y=.05,label="20% clientes",color=C2,size=3.5,hjust=0) +
  scale_x_continuous(labels=percent) +
  scale_y_continuous(labels=percent) +
  labs(title="Concentración de monto por cliente (curva tipo Lorenz)",
       x="% acumulado de clientes", y="% acumulado de monto") +
  tema
print(p_lorenz)

# ─────────────────────────────────────────────────────────────────────────────
# VAR 2: Anon_Document_ID
# ─────────────────────────────────────────────────────────────────────────────
analizar_id(df, "anon_document_id", 2)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 2: ANON_DOCUMENT_ID  [IDENTIFICADOR]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : character 
##   Tipo analítico   : Identificador / Código único
## 
##   Total registros  : 341,027 
##   Valores únicos   : 10,000 
##   Duplicados       : 331,027 
##   Faltantes        : 0 
##   % Unicidad       : 2.93 %
##   Uso sugerido     : Agrupador / Segmentador (no es clave única) 
## 
##   Distribución de frecuencia de repetición:
## # A tibble: 10 × 2
##    veces_repetido cantidad_de_IDs
##             <int>           <int>
##  1             14              13
##  2             15              44
##  3             16              89
##  4             17             123
##  5             18             201
##  6             19             267
##  7             20             286
##  8             21             265
##  9             22             230
## 10             23             271
## 
##   Top 15 valores más frecuentes:
## # A tibble: 15 × 2
##    ID         Registros
##    <chr>          <int>
##  1 XXXXXX2145       236
##  2 XXXXXX5881       223
##  3 XXXXXX3880       222
##  4 XXXXXX1107       220
##  5 XXXXXX1544       218
##  6 XXXXXX3807       217
##  7 XXXXXX0617       212
##  8 XXXXXX2836       212
##  9 XXXXXX5331       208
## 10 XXXXXX1914       197
## 11 XXXXXX1847       191
## 12 XXXXXX3280       177
## 13 XXXXXX0089       176
## 14 XXXXXX0692       170
## 15 XXXXXX1034       169

# ─────────────────────────────────────────────────────────────────────────────
# VAR 3: Terms_of_Payment
# ─────────────────────────────────────────────────────────────────────────────
analizar_categorica(df, "terms_of_payment", 3, top_n=15)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 3: TERMS_OF_PAYMENT  [CATEGÓRICA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : character 
##   Tipo analítico   : Cualitativa nominal/ordinal
## 
##   Registros totales  : 341,027 
##   Valores faltantes  : 58231 ( 17.08 %)
##   Categorías únicas  : 36 
##   Categoría más frec.: Z914 → 122203 registros
##   Categoría menos fr.: Z721 → 1 registros
## 
##   Tabla de frecuencias (top 15 ):
## # A tibble: 15 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 Z914        122203    43.2       43.2
##  2 Z522         62352    22.0       65.3
##  3 Z526         29496    10.4       75.7
##  4 Z521         24874     8.8       84.5
##  5 Z913         19918     7.04      91.5
##  6 Z000         13488     4.77      96.3
##  7 Z540          4845     1.71      98.0
##  8 Z525          1947     0.69      98.7
##  9 Z672          1466     0.52      99.2
## 10 Z691           422     0.15      99.4
## 11 Z040           356     0.13      99.5
## 12 Z505           329     0.12      99.6
## 13 Z090           272     0.1       99.7
## 14 B045           215     0.08      99.8
## 15 P030           151     0.05      99.9
## 
##   Categorías raras (<1% del total): 29

cat("\n  ─ NOTA ANALÍTICA ─────────────────────────────────────────\n")

## 
##   ─ NOTA ANALÍTICA ─────────────────────────────────────────

cat("  Terms_of_Payment vacía ('') puede indicar facturas sin\n")

##   Terms_of_Payment vacía ('') puede indicar facturas sin

cat("  condición asignada. Revisar si corresponde a un tipo\n")

##   condición asignada. Revisar si corresponde a un tipo

cat("  específico de documento (ej: abonos o notas crédito).\n")

##   específico de documento (ej: abonos o notas crédito).

tab_terms_tipo <- df %>%
  mutate(terms_vacia = terms_of_payment == "" | is.na(terms_of_payment)) %>%
  count(document_type, terms_vacia) %>%
  arrange(document_type, terms_vacia)
cat("\n  Terms_of_Payment vacía por Document_Type:\n")

## 
##   Terms_of_Payment vacía por Document_Type:

print(tab_terms_tipo)

## # A tibble: 16 × 3
##    document_type terms_vacia      n
##    <chr>         <lgl>        <int>
##  1 AB            FALSE        14879
##  2 AB            TRUE          1627
##  3 CC            FALSE         3039
##  4 DA            FALSE        23074
##  5 DA            TRUE          5019
##  6 DG            FALSE           18
##  7 DR            FALSE           62
##  8 DZ            FALSE        45256
##  9 DZ            TRUE         40274
## 10 NC            FALSE        34420
## 11 ND            FALSE            4
## 12 RU            FALSE        15260
## 13 RV            FALSE       141669
## 14 SA            FALSE           11
## 15 ZV            FALSE         5104
## 16 ZV            TRUE         11311

# ─────────────────────────────────────────────────────────────────────────────
# VAR 4: Document_Type
# ─────────────────────────────────────────────────────────────────────────────
analizar_categorica(df, "document_type", 4, top_n=15)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 4: DOCUMENT_TYPE  [CATEGÓRICA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : character 
##   Tipo analítico   : Cualitativa nominal/ordinal
## 
##   Registros totales  : 341,027 
##   Valores faltantes  : 0 ( 0 %)
##   Categorías únicas  : 12 
##   Categoría más frec.: RV → 141669 registros
##   Categoría menos fr.: ND → 4 registros
## 
##   Tabla de frecuencias (top 12 ):
## # A tibble: 12 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 RV          141669    41.5       41.5
##  2 DZ           85530    25.1       66.6
##  3 NC           34420    10.1       76.7
##  4 DA           28093     8.24      85.0
##  5 AB           16506     4.84      89.8
##  6 ZV           16415     4.81      94.6
##  7 RU           15260     4.47      99.1
##  8 CC            3039     0.89     100.0
##  9 DR              62     0.02     100.0
## 10 DG              18     0.01     100.0
## 11 SA              11     0        100.0
## 12 ND               4     0        100.0
## 
##   Categorías raras (<1% del total): 5

cat("\n  ─ NOTA ANALÍTICA ─────────────────────────────────────────\n")

## 
##   ─ NOTA ANALÍTICA ─────────────────────────────────────────

cat("  Tipos comunes en bases SAP/I2C:\n")

##   Tipos comunes en bases SAP/I2C:

cat("    DA = Débito estándar (factura)\n")

##     DA = Débito estándar (factura)

cat("    AB = Abono / Nota crédito\n")

##     AB = Abono / Nota crédito

cat("    DZ = Pago\n")

##     DZ = Pago

cat("    KG = Nota crédito proveedor\n")

##     KG = Nota crédito proveedor

# Monto promedio por tipo de documento
tab_monto_tipo <- df %>% filter(!is.na(amount)) %>%
  group_by(document_type) %>%
  summarise(
    N=n(),
    Monto_Total=round(sum(amount,na.rm=TRUE),0),
    Monto_Promedio=round(mean(amount,na.rm=TRUE),0),
    Monto_Mediana=round(median(amount,na.rm=TRUE),0),
    .groups="drop"
  ) %>% arrange(desc(N))
cat("\n  Monto por tipo de documento:\n")

## 
##   Monto por tipo de documento:

print(tab_monto_tipo)

## # A tibble: 12 × 5
##    document_type      N Monto_Total Monto_Promedio Monto_Mediana
##    <chr>          <int>       <dbl>          <dbl>         <dbl>
##  1 RV            141669     3.28e12       23118025       6131832
##  2 DZ             85530    -2.66e12      -31103126         12000
##  3 NC             34420    -1.98e11       -5762334       -372867
##  4 DA             28093     1.31e11        4673336          7250
##  5 AB             16506    -5.75e 9        -348472         22591
##  6 ZV             16415    -2.83e11      -17227663      -4057161
##  7 RU             15260    -1.66e10       -1089410       -269124
##  8 CC              3039    -7.18e10      -23616576      -5425696
##  9 DR                62     1.43e 9       23022870       2565299
## 10 DG                18    -1.49e 8       -8256665      -2731239
## 11 SA                11     4.79e 8       43508945      53582052
## 12 ND                 4     2.97e 6         742010       1484019

# ─────────────────────────────────────────────────────────────────────────────
# VAR 5: Document_Date
# ─────────────────────────────────────────────────────────────────────────────
analizar_fecha(df, "document_date", 5)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 5: DOCUMENT_DATE  [FECHA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : Date 
##   Tipo analítico   : Variable de fecha/tiempo
## 
##   Registros válidos  : 341,027 
##   Faltantes          : 0 ( 0 %)
##   Fecha mínima       : 2022-01-03 
##   Fecha máxima       : 2026-03-28 
##   Rango en días      : 1,545 
##   Rango en años      : 4.2 
##   Fechas antes 2000  : 0 
##   Fechas > hoy+2años : 0 
## 
##   Registros por año:
## # A tibble: 5 × 2
##    Anio     n
##   <dbl> <int>
## 1  2022 89113
## 2  2023 86797
## 3  2024 73546
## 4  2025 69380
## 5  2026 22191
## 
##   Registros por mes (agregado de todos los años):
## # A tibble: 12 × 2
##    Mes            n
##    <ord>      <int>
##  1 enero      30024
##  2 febrero    34189
##  3 marzo      36199
##  4 abril      25342
##  5 mayo       28335
##  6 junio      27057
##  7 julio      26664
##  8 agosto     27734
##  9 septiembre 29484
## 10 octubre    25967
## 11 noviembre  24725
## 12 diciembre  25307

# ─────────────────────────────────────────────────────────────────────────────
# VAR 6: Payment_date
# ─────────────────────────────────────────────────────────────────────────────
analizar_fecha(df, "payment_date", 6)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 6: PAYMENT_DATE  [FECHA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : Date 
##   Tipo analítico   : Variable de fecha/tiempo
## 
##   Registros válidos  : 341,027 
##   Faltantes          : 0 ( 0 %)
##   Fecha mínima       : 2007-08-17 
##   Fecha máxima       : 4025-04-22 
##   Rango en días      : 7.369e+05 
##   Rango en años      : 2019 
##   Fechas antes 2000  : 0 
##   Fechas > hoy+2años : 7 
## 
##   Registros por año:
## # A tibble: 15 × 2
##     Anio     n
##    <dbl> <int>
##  1  2007     1
##  2  2013     4
##  3  2017     5
##  4  2018    17
##  5  2019     2
##  6  2020     1
##  7  2021    27
##  8  2022 86867
##  9  2023 86691
## 10  2024 73162
## 11  2025 69321
## 12  2026 24922
## 13  2202     3
## 14  2204     1
## 15  4025     3
## 
##   Registros por mes (agregado de todos los años):
## # A tibble: 12 × 2
##    Mes            n
##    <ord>      <int>
##  1 enero      25793
##  2 febrero    28972
##  3 marzo      35418
##  4 abril      31990
##  5 mayo       26957
##  6 junio      27870
##  7 julio      27566
##  8 agosto     27492
##  9 septiembre 28213
## 10 octubre    26906
## 11 noviembre  25521
## 12 diciembre  28329

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

# Diferencia entre document_date y payment_date
df_dif_pago <- df %>%
  filter(!is.na(document_date), !is.na(payment_date)) %>%
  mutate(dias_hasta_pago = as.numeric(payment_date - document_date))

cat("  Días desde emisión hasta pago:\n")

##   Días desde emisión hasta pago:

cat("    Media   :", round(mean(df_dif_pago$dias_hasta_pago,na.rm=TRUE),1), "\n")

##     Media   : 22.8

cat("    Mediana :", round(median(df_dif_pago$dias_hasta_pago,na.rm=TRUE),1), "\n")

##     Mediana : 3

cat("    Min     :", min(df_dif_pago$dias_hasta_pago,na.rm=TRUE), "\n")

##     Min     : -5705

cat("    Max     :", max(df_dif_pago$dias_hasta_pago,na.rm=TRUE), "\n")

##     Max     : 730534

cat("  Pagos antes de emisión (días negativos):",
    sum(df_dif_pago$dias_hasta_pago < 0, na.rm=TRUE), "\n")

##   Pagos antes de emisión (días negativos): 23132

# ─────────────────────────────────────────────────────────────────────────────
# VAR 7: Net_due_date
# ─────────────────────────────────────────────────────────────────────────────
analizar_fecha(df, "net_due_date", 7)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 7: NET_DUE_DATE  [FECHA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : Date 
##   Tipo analítico   : Variable de fecha/tiempo
## 
##   Registros válidos  : 341,027 
##   Faltantes          : 0 ( 0 %)
##   Fecha mínima       : 2007-08-17 
##   Fecha máxima       : 4025-04-22 
##   Rango en días      : 7.369e+05 
##   Rango en años      : 2019 
##   Fechas antes 2000  : 0 
##   Fechas > hoy+2años : 7 
## 
##   Registros por año:
## # A tibble: 15 × 2
##     Anio     n
##    <dbl> <int>
##  1  2007     1
##  2  2013     4
##  3  2017     5
##  4  2018    17
##  5  2019     2
##  6  2020     1
##  7  2021    26
##  8  2022 86089
##  9  2023 86387
## 10  2024 73565
## 11  2025 69338
## 12  2026 25585
## 13  2202     3
## 14  2204     1
## 15  4025     3
## 
##   Registros por mes (agregado de todos los años):
## # A tibble: 12 × 2
##    Mes            n
##    <ord>      <int>
##  1 enero      27446
##  2 febrero    26362
##  3 marzo      35394
##  4 abril      32121
##  5 mayo       27831
##  6 junio      27348
##  7 julio      27666
##  8 agosto     27621
##  9 septiembre 27787
## 10 octubre    27320
## 11 noviembre  25558
## 12 diciembre  28573

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

# Inconsistencias lógicas
n_fecha_inv <- df %>%
  filter(!is.na(document_date), !is.na(net_due_date)) %>%
  summarise(n = sum(net_due_date < document_date)) %>%
  pull(n)
cat("  Registros donde net_due_date < document_date (error lógico):",
    n_fecha_inv, "\n")

##   Registros donde net_due_date < document_date (error lógico): 17710

# ─────────────────────────────────────────────────────────────────────────────
# VAR 8: Arrears_after_net_due_date  →  arrears
# ─────────────────────────────────────────────────────────────────────────────
analizar_numerica(df, "arrears", 8, "Arrears after net due date (días de atraso)")

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 8: ARREARS  [NUMÉRICA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : numeric 
##   Tipo analítico   : Cuantitativa continua
## 
##   ─ CONTEO ─────────────────────────────────────────────
##   Registros válidos  : 341,027 
##   Valores faltantes  : 0 ( 0 %)
## 
##   ─ TENDENCIA CENTRAL ──────────────────────────────────
##   Media              : 15.94 
##   Mediana            : 6 
##   Moda               : 0 
## 
##   ─ DISPERSIÓN ─────────────────────────────────────────
##   Mínimo             : -2.831e+04 
##   Máximo             : 5,863 
##   Rango              : 3.417e+04 
##   Varianza           : 1.834e+04 
##   Desv. estándar     : 135.4 
##   Coef. variación    : 849.82% 
##   IQR (Q3-Q1)        : 21 
## 
##   ─ CUARTILES Y PERCENTILES ────────────────────────────
##   P5                 : -17 
##   P10                : -12 
##   Q1 (P25)           : 0 
##   Mediana (P50)      : 6 
##   Q3 (P75)           : 21 
##   P90                : 45 
##   P95                : 78 
##   P99                : 197 
## 
##   ─ FORMA DE LA DISTRIBUCIÓN ───────────────────────────
##   Asimetría          : -175.962 
##   Curtosis           : 36362.8 
##   Interpretación     : Sesgo negativo (cola izquierda): valores bajos extremos 
##   Curtosis interp.   : Leptocúrtica: colas pesadas, más outliers que normal 
## 
##   ─ OUTLIERS IQR ───────────────────────────────────────
##   Límite inferior IQR: -31.5 
##   Límite superior IQR: 52.5 
##   N° outliers        : 34,206 
##   % de outliers      : 10.03 %
##   Resumen outliers   : Min= -28308 | Max= 5863

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

cat("  Arrears ≤ 0 (pagó a tiempo o antes):",
    formatC(sum(df$arrears <= 0, na.rm=TRUE), big.mark=","), "\n")

##   Arrears ≤ 0 (pagó a tiempo o antes): 111,154

cat("  Arrears > 0 (pago tardío)          :",
    formatC(sum(df$arrears > 0,  na.rm=TRUE), big.mark=","), "\n")

##   Arrears > 0 (pago tardío)          : 229,873

cat("  Arrears negativos (adelanto)       :",
    formatC(sum(df$arrears < 0,  na.rm=TRUE), big.mark=","), "\n")

##   Arrears negativos (adelanto)       : 65,224

# Distribución por rangos de mora
tab_rango_arr <- df %>%
  filter(!is.na(arrears)) %>%
  mutate(Rango = case_when(
    arrears <   0  ~ "Negativo (adelanto)",
    arrears ==  0  ~ "Cero (exacto)",
    arrears <= 30  ~ "1 - 30 días",
    arrears <= 60  ~ "31 - 60 días",
    arrears <= 90  ~ "61 - 90 días",
    arrears <= 180 ~ "91 - 180 días",
    arrears <= 360 ~ "181 - 360 días",
    TRUE            ~ "Más de 360 días"
  ),
  Rango = factor(Rango, levels=c("Negativo (adelanto)","Cero (exacto)",
    "1 - 30 días","31 - 60 días","61 - 90 días",
    "91 - 180 días","181 - 360 días","Más de 360 días"))) %>%
  count(Rango) %>%
  mutate(Pct=round(n/sum(n)*100,2))
cat("\n  Distribución por rangos de atraso:\n")

## 
##   Distribución por rangos de atraso:

print(tab_rango_arr, n=Inf)

## # A tibble: 8 × 3
##   Rango                    n   Pct
##   <fct>                <int> <dbl>
## 1 Negativo (adelanto)  65224 19.1 
## 2 Cero (exacto)        45930 13.5 
## 3 1 - 30 días         173943 51.0 
## 4 31 - 60 días         32091  9.41
## 5 61 - 90 días          9629  2.82
## 6 91 - 180 días         9867  2.89
## 7 181 - 360 días        3432  1.01
## 8 Más de 360 días        911  0.27

colores_arr <- c("Negativo (adelanto)"=C4, "Cero (exacto)"="#A8D5A2",
                 "1 - 30 días"=C3, "31 - 60 días"="#E8A930",
                 "61 - 90 días"="#E07020", "91 - 180 días"=C2,
                 "181 - 360 días"="#A00000", "Más de 360 días"="#5B0000")

p_arr_rango <- ggplot(tab_rango_arr, aes(x=Rango, y=n, fill=Rango)) +
  geom_col(alpha=0.9, show.legend=FALSE) +
  geom_text(aes(label=paste0(Pct,"%")), vjust=-0.3, size=3.5) +
  scale_fill_manual(values=colores_arr) +
  scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.12))) +
  labs(title="Distribución por rangos de días de atraso",
       x=NULL, y="Registros") +
  tema + theme(axis.text.x=element_text(angle=35,hjust=1))
print(p_arr_rango)

# ─────────────────────────────────────────────────────────────────────────────
# VAR 9: Amount_in_local_currency  →  amount
# ─────────────────────────────────────────────────────────────────────────────
analizar_numerica(df, "amount", 9, "Amount in local currency (monto)")

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 9: AMOUNT  [NUMÉRICA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : numeric 
##   Tipo analítico   : Cuantitativa continua
## 
##   ─ CONTEO ─────────────────────────────────────────────
##   Registros válidos  : 341,027 
##   Valores faltantes  : 0 ( 0 %)
## 
##   ─ TENDENCIA CENTRAL ──────────────────────────────────
##   Media              : 5.062e+05 
##   Mediana            : 2.899e+05 
##   Moda               : 0 
## 
##   ─ DISPERSIÓN ─────────────────────────────────────────
##   Mínimo             : -2.08e+10 
##   Máximo             : 2.08e+10 
##   Rango              : 4.16e+10 
##   Varianza           : 2.2e+16 
##   Desv. estándar     : 1.483e+08 
##   Coef. variación    : 29300.2% 
##   IQR (Q3-Q1)        : 6.399e+06 
## 
##   ─ CUARTILES Y PERCENTILES ────────────────────────────
##   P5                 : -3.146e+07 
##   P10                : -9.21e+06 
##   Q1 (P25)           : -4.592e+05 
##   Mediana (P50)      : 2.899e+05 
##   Q3 (P75)           : 5.94e+06 
##   P90                : 2.628e+07 
##   P95                : 5.276e+07 
##   P99                : 1.864e+08 
## 
##   ─ FORMA DE LA DISTRIBUCIÓN ───────────────────────────
##   Asimetría          : -7.993 
##   Curtosis           : 5362.047 
##   Interpretación     : Sesgo negativo (cola izquierda): valores bajos extremos 
##   Curtosis interp.   : Leptocúrtica: colas pesadas, más outliers que normal 
## 
##   ─ OUTLIERS IQR ───────────────────────────────────────
##   Límite inferior IQR: -1.006e+07 
##   Límite superior IQR: 1.554e+07 
##   N° outliers        : 83,272 
##   % de outliers      : 24.42 %
##   Resumen outliers   : Min= -20800206706 | Max= 20800206706

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

n_neg <- sum(df$amount < 0, na.rm=TRUE)
n_pos <- sum(df$amount > 0, na.rm=TRUE)
n_cer <- sum(df$amount == 0, na.rm=TRUE)
cat("  Montos positivos   :", formatC(n_pos, big.mark=","),
    "(", round(n_pos/nrow(df)*100,2), "%)\n")

##   Montos positivos   : 214,710 ( 62.96 %)

cat("  Montos negativos   :", formatC(n_neg, big.mark=","),
    "(", round(n_neg/nrow(df)*100,2), "%) → notas crédito / abonos\n")

##   Montos negativos   : 124,475 ( 36.5 %) → notas crédito / abonos

cat("  Montos = 0         :", formatC(n_cer, big.mark=","), "\n")

##   Montos = 0         : 1,842

cat("  Monto total neto   :", formatC(round(sum(df$amount,na.rm=TRUE),0),
                                       big.mark=","), "\n")

##   Monto total neto   : 1.726e+11

# Monto por estado de cartera
tab_monto_estado <- df %>% filter(!is.na(amount)) %>%
  group_by(estado_cartera) %>%
  summarise(
    N=n(),
    Monto_Total =round(sum(amount,na.rm=TRUE),0),
    Monto_Prom  =round(mean(amount,na.rm=TRUE),0),
    Monto_Med   =round(median(amount,na.rm=TRUE),0),
    .groups="drop"
  )
cat("\n  Monto por estado de cartera:\n")

## 
##   Monto por estado de cartera:

print(tab_monto_estado)

## # A tibble: 2 × 5
##   estado_cartera      N  Monto_Total Monto_Prom Monto_Med
##   <chr>           <int>        <dbl>      <dbl>     <dbl>
## 1 Abierta          6075 114448362999   18839237   2097018
## 2 Cerrada        334952  58178418073     173692    269892

p_monto_estado <- ggplot(
  df %>% filter(!is.na(amount), !is.na(estado_cartera)),
  aes(x=estado_cartera, y=amount, fill=estado_cartera)) +
  geom_boxplot(alpha=0.7, outlier.alpha=0.08, show.legend=FALSE) +
  scale_fill_manual(values=c("Abierta"=C2, "Cerrada"=C4)) +
  scale_y_continuous(labels=comma) +
  labs(title="Distribución de monto por estado de cartera",
       x=NULL, y="Monto") +
  tema
print(p_monto_estado)

# ─────────────────────────────────────────────────────────────────────────────
# VAR 10: Reason_code
# ─────────────────────────────────────────────────────────────────────────────
analizar_categorica(df, "reason_code", 10, top_n=15)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 10: REASON_CODE  [CATEGÓRICA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : character 
##   Tipo analítico   : Cualitativa nominal/ordinal
## 
##   Registros totales  : 341,027 
##   Valores faltantes  : 305226 ( 89.5 %)
##   Categorías únicas  : 18 
##   Categoría más frec.: 50 → 11999 registros
##   Categoría menos fr.: 61 → 3 registros
## 
##   Tabla de frecuencias (top 15 ):
## # A tibble: 15 × 4
##    Categoria Freq_Abs Freq_Rel Freq_Acum
##    <chr>        <int>    <dbl>     <dbl>
##  1 50           11999    33.5       33.5
##  2 81            9942    27.8       61.3
##  3 62            7207    20.1       81.4
##  4 21            2171     6.06      87.5
##  5 403           2037     5.69      93.2
##  6 12             700     1.96      95.1
##  7 59             495     1.38      96.5
##  8 76             343     0.96      97.5
##  9 44             247     0.69      98.2
## 10 72             208     0.58      98.7
## 11 71             198     0.55      99.3
## 12 22              72     0.2       99.5
## 13 70              59     0.16      99.6
## 14 90              58     0.16      99.8
## 15 31              55     0.15     100.0
## 
##   Categorías raras (<1% del total): 11

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

# Monto promedio por reason_code
tab_rc_monto <- df %>% filter(!is.na(amount)) %>%
  group_by(reason_code) %>%
  summarise(N=n(),
            Monto_Total=round(sum(amount,na.rm=TRUE),0),
            Monto_Prom=round(mean(amount,na.rm=TRUE),0),
            .groups="drop") %>%
  arrange(desc(N)) %>% slice_head(n=10)
cat("  Monto por reason_code (top 10 más frecuentes):\n")

##   Monto por reason_code (top 10 más frecuentes):

print(tab_rc_monto)

## # A tibble: 10 × 4
##    reason_code      N   Monto_Total Monto_Prom
##    <chr>        <int>         <dbl>      <dbl>
##  1 <NA>        305226  169514303215     555373
##  2 50           11999 -109805154363   -9151192
##  3 81            9942  113246747190   11390741
##  4 62            7207    5186821008     719692
##  5 21            2171   -1004290526    -462594
##  6 403           2037  -11481240591   -5636348
##  7 12             700     151448452     216355
##  8 59             495     618991409    1250488
##  9 76             343    7998226778   23318445
## 10 44             247   -1941621185   -7860815

# ─────────────────────────────────────────────────────────────────────────────
# VAR 11: Clearing_date
# ─────────────────────────────────────────────────────────────────────────────
analizar_fecha(df, "clearing_date", 11)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 11: CLEARING_DATE  [FECHA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : Date 
##   Tipo analítico   : Variable de fecha/tiempo
## 
##   Registros válidos  : 329,053 
##   Faltantes          : 11,974 ( 3.51 %)
##   Fecha mínima       : 2022-01-03 
##   Fecha máxima       : 2026-04-01 
##   Rango en días      : 1,549 
##   Rango en años      : 4.2 
##   Fechas antes 2000  : 0 
##   Fechas > hoy+2años : 0 
## 
##   Registros por año:
## # A tibble: 5 × 2
##    Anio     n
##   <dbl> <int>
## 1  2022 83077
## 2  2023 86560
## 3  2024 73436
## 4  2025 69649
## 5  2026 16331
## 
##   Registros por mes (agregado de todos los años):
## # A tibble: 12 × 2
##    Mes            n
##    <ord>      <int>
##  1 enero      23892
##  2 febrero    25337
##  3 marzo      33067
##  4 abril      25995
##  5 mayo       25749
##  6 junio      29097
##  7 julio      27245
##  8 agosto     25996
##  9 septiembre 31602
## 10 octubre    25914
## 11 noviembre  24521
## 12 diciembre  30638

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

# Clearing_date NA según estado de cartera
tab_clear_estado <- df %>%
  mutate(clearing_na = is.na(clearing_date)) %>%
  count(estado_cartera, clearing_na) %>%
  mutate(Pct=round(n/sum(n)*100,2))
cat("  Faltantes en clearing_date por estado de cartera:\n")

##   Faltantes en clearing_date por estado de cartera:

print(tab_clear_estado)

## # A tibble: 4 × 4
##   estado_cartera clearing_na      n   Pct
##   <chr>          <lgl>        <int> <dbl>
## 1 Abierta        FALSE           88  0.03
## 2 Abierta        TRUE          5987  1.76
## 3 Cerrada        FALSE       328965 96.5 
## 4 Cerrada        TRUE          5987  1.76

cat("\n  NOTA: clearing_date nula en 'Abierta' es ESPERADO\n")

## 
##   NOTA: clearing_date nula en 'Abierta' es ESPERADO

cat("  (no se ha compensado). Solo es problema si es 'Cerrada'.\n")

##   (no se ha compensado). Solo es problema si es 'Cerrada'.

n_cerrada_sin_clearing <- df %>%
  filter(estado_cartera == "Cerrada", is.na(clearing_date)) %>% nrow()
cat("  Registros Cerrados sin clearing_date:", n_cerrada_sin_clearing, "\n")

##   Registros Cerrados sin clearing_date: 5987

# ─────────────────────────────────────────────────────────────────────────────
# VAR 12: Year/month  →  year_month
# ─────────────────────────────────────────────────────────────────────────────
separador("VARIABLE 12: year_month  [CATEGÓRICA TEMPORAL]")

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 12: YEAR_MONTH  [CATEGÓRICA TEMPORAL]
## ═════════════════════════════════════════════════════════════════

cat("  Tipo en R        :", class(df$year_month)[1], "\n")

##   Tipo en R        : character

cat("  Tipo analítico   : Categórica de periodo (Año/Mes)\n\n")

##   Tipo analítico   : Categórica de periodo (Año/Mes)

n_na_ym   <- sum(is.na(df$year_month) | df$year_month == "")
n_cat_ym  <- n_distinct(df$year_month, na.rm=TRUE)
cat("  Períodos únicos  :", n_cat_ym, "\n")

##   Períodos únicos  : 51

cat("  Faltantes        :", n_na_ym, "\n")

##   Faltantes        : 0

tab_ym <- df %>%
  filter(!is.na(year_month), year_month != "") %>%
  count(year_month, name="Registros") %>%
  arrange(year_month) %>%
  mutate(Pct=round(Registros/sum(Registros)*100,2))
cat("\n  Registros por período (Year/Month):\n")

## 
##   Registros por período (Year/Month):

print(tab_ym, n=Inf)

## # A tibble: 51 × 3
##    year_month Registros   Pct
##    <chr>          <int> <dbl>
##  1 2022/01         6223  1.82
##  2 2022/02         6808  2   
##  3 2022/03         8231  2.41
##  4 2022/04         6659  1.95
##  5 2022/05         7945  2.33
##  6 2022/06         8793  2.58
##  7 2022/07         7059  2.07
##  8 2022/08         7134  2.09
##  9 2022/09         8688  2.55
## 10 2022/10         6719  1.97
## 11 2022/11         6701  1.96
## 12 2022/12         8096  2.37
## 13 2023/01         5912  1.73
## 14 2023/02         6596  1.93
## 15 2023/03         7900  2.32
## 16 2023/04         6223  1.82
## 17 2023/05         7110  2.08
## 18 2023/06         8215  2.41
## 19 2023/07         6714  1.97
## 20 2023/08         6916  2.03
## 21 2023/09         8950  2.62
## 22 2023/10         6817  2   
## 23 2023/11         6546  1.92
## 24 2023/12         8671  2.54
## 25 2024/01         5592  1.64
## 26 2024/02         6823  2   
## 27 2024/03         7174  2.1 
## 28 2024/04         5998  1.76
## 29 2024/05         6010  1.76
## 30 2024/06         7274  2.13
## 31 2024/07         5256  1.54
## 32 2024/08         5428  1.59
## 33 2024/09         7285  2.14
## 34 2024/10         5514  1.62
## 35 2024/11         4888  1.43
## 36 2024/12         6078  1.78
## 37 2025/01         4327  1.27
## 38 2025/02         5476  1.61
## 39 2025/03         6962  2.04
## 40 2025/04         5053  1.48
## 41 2025/05         5193  1.52
## 42 2025/06         6608  1.94
## 43 2025/07         5629  1.65
## 44 2025/08         5847  1.71
## 45 2025/09         7330  2.15
## 46 2025/10         5567  1.63
## 47 2025/11         5129  1.5 
## 48 2025/12         6482  1.9 
## 49 2026/01         5075  1.49
## 50 2026/02         6653  1.95
## 51 2026/03        10750  3.15

# Gráfico: tendencia por período
p_ym <- ggplot(tab_ym, aes(x=year_month, y=Registros)) +
  geom_col(fill=C1, alpha=0.85) +
  scale_y_continuous(labels=comma) +
  labs(title="Registros por período Year/Month",
       x="Período", y="Registros") +
  tema + theme(axis.text.x=element_text(angle=60, hjust=1, size=7))
print(p_ym)

# Monto por período
tab_ym_monto <- df %>%
  filter(!is.na(year_month), year_month != "", !is.na(amount)) %>%
  group_by(year_month) %>%
  summarise(Monto_Total=sum(amount,na.rm=TRUE), .groups="drop") %>%
  arrange(year_month)

p_ym_monto <- ggplot(tab_ym_monto, aes(x=year_month, y=Monto_Total)) +
  geom_line(aes(group=1), color=C3, linewidth=1) +
  geom_point(color=C2, size=1.5) +
  scale_y_continuous(labels=comma) +
  labs(title="Monto total por período Year/Month",
       x="Período", y="Monto total") +
  tema + theme(axis.text.x=element_text(angle=60,hjust=1,size=7))
print(p_ym_monto)

# ─────────────────────────────────────────────────────────────────────────────
# VAR 13: Estado_Cartera
# ─────────────────────────────────────────────────────────────────────────────
analizar_categorica(df, "estado_cartera", 13, top_n=10)

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 13: ESTADO_CARTERA  [CATEGÓRICA]
## ═════════════════════════════════════════════════════════════════
## 
##   Tipo en R        : character 
##   Tipo analítico   : Cualitativa nominal/ordinal
## 
##   Registros totales  : 341,027 
##   Valores faltantes  : 0 ( 0 %)
##   Categorías únicas  : 2 
##   Categoría más frec.: Cerrada → 334952 registros
##   Categoría menos fr.: Abierta → 6075 registros
## 
##   Tabla de frecuencias (top 2 ):
## # A tibble: 2 × 4
##   Categoria Freq_Abs Freq_Rel Freq_Acum
##   <chr>        <int>    <dbl>     <dbl>
## 1 Cerrada     334952    98.2       98.2
## 2 Abierta       6075     1.78     100  
## 
##   Categorías raras (<1% del total): 0

cat("\n  ─ ANÁLISIS ADICIONAL ─────────────────────────────────────\n")

## 
##   ─ ANÁLISIS ADICIONAL ─────────────────────────────────────

# Estado por tipo de documento
tab_estado_tipo <- df %>%
  count(estado_cartera, document_type) %>%
  arrange(estado_cartera, desc(n))
cat("  Estado de cartera por tipo de documento:\n")

##   Estado de cartera por tipo de documento:

print(tab_estado_tipo)

## # A tibble: 19 × 3
##    estado_cartera document_type      n
##    <chr>          <chr>          <int>
##  1 Abierta        RV              3695
##  2 Abierta        DZ               878
##  3 Abierta        DA               552
##  4 Abierta        RU               477
##  5 Abierta        NC               249
##  6 Abierta        ZV               222
##  7 Abierta        AB                 2
##  8 Cerrada        RV            137974
##  9 Cerrada        DZ             84652
## 10 Cerrada        NC             34171
## 11 Cerrada        DA             27541
## 12 Cerrada        AB             16504
## 13 Cerrada        ZV             16193
## 14 Cerrada        RU             14783
## 15 Cerrada        CC              3039
## 16 Cerrada        DR                62
## 17 Cerrada        DG                18
## 18 Cerrada        SA                11
## 19 Cerrada        ND                 4

# Monto total abierta vs cerrada
tab_estado_monto <- df %>% filter(!is.na(amount)) %>%
  group_by(estado_cartera) %>%
  summarise(Monto_Total=round(sum(amount,na.rm=TRUE),0),
            N=n(), .groups="drop") %>%
  mutate(Pct_Monto=round(Monto_Total/sum(Monto_Total)*100,2))
cat("\n  Monto total por estado de cartera:\n")

## 
##   Monto total por estado de cartera:

print(tab_estado_monto)

## # A tibble: 2 × 4
##   estado_cartera  Monto_Total      N Pct_Monto
##   <chr>                 <dbl>  <int>     <dbl>
## 1 Abierta        114448362999   6075      66.3
## 2 Cerrada         58178418073 334952      33.7

p_estado_monto <- ggplot(
  df %>% filter(!is.na(estado_cartera), !is.na(amount)),
  aes(x=estado_cartera, y=amount, fill=estado_cartera)) +
  geom_boxplot(alpha=0.7, outlier.alpha=0.1, show.legend=FALSE) +
  scale_fill_manual(values=c("Abierta"=C2, "Cerrada"=C4)) +
  scale_y_continuous(labels=comma) +
  labs(title="Monto por estado de cartera",
       subtitle="Abierta = pendiente de cobro  |  Cerrada = compensada",
       x=NULL, y="Monto") +
  tema
print(p_estado_monto)

# ─────────────────────────────────────────────────────────────────────────────
# VAR 14: Bucket_Mora
# ─────────────────────────────────────────────────────────────────────────────
separador("VARIABLE 14: bucket_mora  [CATEGÓRICA ORDINAL]")

## 
## ═════════════════════════════════════════════════════════════════
##  VARIABLE 14: BUCKET_MORA  [CATEGÓRICA ORDINAL]
## ═════════════════════════════════════════════════════════════════

cat("  Tipo en R        :", class(df$bucket_mora)[1], "\n")

##   Tipo en R        : ordered

cat("  Tipo analítico   : Cualitativa ordinal (tramos de mora ordenados)\n")

##   Tipo analítico   : Cualitativa ordinal (tramos de mora ordenados)

cat("  Orden de niveles : Al dia < 1-30 < 31-60 < 61-90 < 91-180 < 181-360 < >360\n\n")

##   Orden de niveles : Al dia < 1-30 < 31-60 < 61-90 < 91-180 < 181-360 < >360

n_na_bm  <- sum(is.na(df$bucket_mora))
tab_bm <- df %>%
  filter(!is.na(bucket_mora)) %>%
  count(bucket_mora, name="Freq_Abs") %>%
  mutate(Freq_Rel  = round(Freq_Abs/sum(Freq_Abs)*100,2),
         Freq_Acum = cumsum(Freq_Rel))

cat("  Faltantes        :", n_na_bm, "\n")

##   Faltantes        : 0

cat("  Categorías       :", n_distinct(df$bucket_mora, na.rm=TRUE), "\n")

##   Categorías       : 7

cat("  Bucket más frec. :", as.character(tab_bm$bucket_mora[1]), "\n\n")

##   Bucket más frec. : Al dia

cat("  Tabla de frecuencias:\n")

##   Tabla de frecuencias:

print(tab_bm, n=Inf)

## # A tibble: 7 × 4
##   bucket_mora Freq_Abs Freq_Rel Freq_Acum
##   <ord>          <int>    <dbl>     <dbl>
## 1 Al dia        111154    32.6       32.6
## 2 1-30          173943    51.0       83.6
## 3 31-60          32091     9.41      93.0
## 4 61-90           9629     2.82      95.8
## 5 91-180          9867     2.89      98.7
## 6 181-360         3432     1.01      99.7
## 7 >360             911     0.27     100

# Semáforo de salud de cartera
pct_al_dia   <- tab_bm %>% filter(bucket_mora == "Al dia")   %>% pull(Freq_Rel)
pct_critico  <- tab_bm %>% filter(bucket_mora %in% c("91-180","181-360",">360")) %>%
  summarise(pct=sum(Freq_Rel)) %>% pull(pct)
cat("\n  ─ SEMÁFORO DE SALUD DE CARTERA ──────────────────────────\n")

## 
##   ─ SEMÁFORO DE SALUD DE CARTERA ──────────────────────────

cat("  % Al día               :", if(length(pct_al_dia)==0) 0 else pct_al_dia, "%\n")

##   % Al día               : 32.59 %

cat("  % Mora crítica (>90d)  :", pct_critico, "%\n")

##   % Mora crítica (>90d)  : 4.17 %

cat("  Diagnóstico            :", case_when(
  pct_al_dia > 80 ~ "✓ Cartera SANA — mayoría al día",
  pct_al_dia > 60 ~ "▲ Cartera MODERADA — seguimiento recomendado",
  TRUE             ~ "✖ Cartera DETERIORADA — gestión urgente requerida"
), "\n")

##   Diagnóstico            : ✖ Cartera DETERIORADA — gestión urgente requerida

# Monto por bucket
tab_bm_monto <- df %>% filter(!is.na(amount), !is.na(bucket_mora)) %>%
  group_by(bucket_mora) %>%
  summarise(
    N=n(),
    Monto_Total=round(sum(amount,na.rm=TRUE),0),
    Monto_Prom =round(mean(amount,na.rm=TRUE),0),
    Monto_Med  =round(median(amount,na.rm=TRUE),0),
    .groups="drop"
  )
cat("\n  Monto por tramo de mora:\n")

## 
##   Monto por tramo de mora:

print(tab_bm_monto, n=Inf)

## # A tibble: 7 × 5
##   bucket_mora      N   Monto_Total Monto_Prom Monto_Med
##   <ord>        <int>         <dbl>      <dbl>     <dbl>
## 1 Al dia      111154  570037689212    5128360   2065602
## 2 1-30        173943 -418469992452   -2405788    209260
## 3 31-60        32091    6319537450     196926     41055
## 4 61-90         9629    9721353862    1009591     32249
## 5 91-180        9867    -106352134     -10779     23600
## 6 181-360       3432    -846591818    -246676     27251
## 7 >360           911    5971136951    6554486    -60000

# Gráficos bucket
colores_bm <- c("Al dia"=C4,"1-30"="#A8D5A2","31-60"=C3,
                "61-90"="#E8A930","91-180"="#E07020",
                "181-360"=C2,">360"="#8B0000")

p_bm_n <- ggplot(tab_bm, aes(x=bucket_mora, y=Freq_Abs, fill=bucket_mora)) +
  geom_col(alpha=0.9, show.legend=FALSE) +
  geom_text(aes(label=paste0(Freq_Rel,"%")), vjust=-0.3, size=3.5) +
  scale_fill_manual(values=colores_bm) +
  scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.12))) +
  labs(title="Frecuencia por tramo de mora", x="Bucket", y="Registros") +
  tema

p_bm_m <- ggplot(tab_bm_monto, aes(x=bucket_mora, y=Monto_Total, fill=bucket_mora)) +
  geom_col(alpha=0.9, show.legend=FALSE) +
  scale_fill_manual(values=colores_bm) +
  scale_y_continuous(labels=comma) +
  labs(title="Monto total por tramo de mora", x="Bucket", y="Monto total") +
  tema

panel_bm <- p_bm_n | p_bm_m
print(panel_bm + plot_annotation(
  title = "Análisis: Bucket de Mora",
  theme = theme(plot.title=element_text(face="bold",size=14,color=C1))
))

# ═══════════════════════════════════════════════════════════════════════════════
# PARTE 3 ▸ CONCLUSIÓN GENERAL AUTOMÁTICA
# ═══════════════════════════════════════════════════════════════════════════════

separador("PARTE 3 — CONCLUSIÓN GENERAL AUTOMÁTICA")

## 
## ═════════════════════════════════════════════════════════════════
##  PARTE 3 — CONCLUSIÓN GENERAL AUTOMÁTICA
## ═════════════════════════════════════════════════════════════════

# Calcular insumos para la conclusión
pct_clear_na   <- round(sum(is.na(df$clearing_date)) / nrow(df) * 100, 1)
pct_payment_na <- round(sum(is.na(df$payment_date))  / nrow(df) * 100, 1)
n_out_arr      <- length(outliers_iqr(df$arrears))
n_out_amt      <- length(outliers_iqr(df$amount))
pct_out_arr    <- round(n_out_arr / sum(!is.na(df$arrears)) * 100, 1)
pct_out_amt    <- round(n_out_amt / sum(!is.na(df$amount))  * 100, 1)
n_clientes     <- n_distinct(df$anon_customer_id)
n_docs         <- n_distinct(df$anon_document_id)
pct_mora_crit  <- tab_bm %>%
  filter(bucket_mora %in% c("91-180","181-360",">360")) %>%
  summarise(s=sum(Freq_Rel)) %>% pull(s)
pct_al_dia_v   <- tab_bm %>% filter(bucket_mora=="Al dia") %>%
  pull(Freq_Rel) %>% {if(length(.)==0) 0 else .}

conclusion <- paste0(
"
╔══════════════════════════════════════════════════════════════════╗
║         CONCLUSIÓN GENERAL DEL ANÁLISIS EXPLORATORIO            ║
╚══════════════════════════════════════════════════════════════════╝

Generado automáticamente el: ", format(Sys.time(), "%d/%m/%Y %H:%M"), "

─── ESTRUCTURA DE LA BASE ───────────────────────────────────────────
  • La base contiene ", formatC(nrow(df), big.mark=","),
  " registros y 14 variables (se eliminó Pago_Oportuno_Bin).
  • Representa ", formatC(n_clientes, big.mark=","), " clientes únicos y ",
  formatC(n_docs, big.mark=","), " documentos únicos.
  • Se identificaron variables de 4 tipos:
    - 2 Identificadores: Anon_Customer_ID, Anon_Document_ID
    - 4 Fechas         : Document_Date, Payment_date, Net_due_date, Clearing_date
    - 2 Numéricas      : Arrears_after_net_due_date, Amount_in_local_currency
    - 6 Categóricas    : Terms_of_Payment, Document_Type, Reason_code,
                         Year/month, Estado_Cartera, Bucket_Mora

─── CALIDAD DE DATOS ────────────────────────────────────────────────
  • Faltantes críticos:
    - clearing_date : ", pct_clear_na,
  "% faltante — NORMAL en cartera abierta (sin compensar)
    - payment_date  : ", pct_payment_na,
  "% faltante — revisar si cartera cerrada sin fecha de pago
  • Terms_of_Payment tiene valores vacíos ('') en parte de los registros.
    Posiblemente corresponde a abonos (AB) o documentos sin condición.
  • Bucket_Mora y Pago_Oportuno_Bin venían como fórmulas IF sin calcular
    en Excel. Se reconstruyeron desde Arrears en este script.
  • Duplicados por fila completa : ", n_dup_filas, "
  • Document IDs con duplicados  : ", n_dup_docs, "

─── VARIABLES NUMÉRICAS ─────────────────────────────────────────────
  • Arrears (días de atraso):
    - Media ", round(mean(df$arrears,na.rm=TRUE),1), " días | Mediana ",
  round(median(df$arrears,na.rm=TRUE),1), " días
    - Valores negativos = pagos anticipados (comportamiento válido)
    - Outliers IQR: ", formatC(n_out_arr,big.mark=","),
  " registros (", pct_out_arr, "%)
    - Distribución fuertemente sesgada a la derecha: muchos registros
      en mora muy alta (>360 días)
  • Amount (monto):
    - Montos negativos = notas crédito / abonos contables (válidos)
    - Outliers IQR: ", formatC(n_out_amt,big.mark=","),
  " registros (", pct_out_amt, "%)
    - Alta variabilidad entre clientes: revisar concentración

─── VARIABLES CATEGÓRICAS ───────────────────────────────────────────
  • Document_Type: identificar cuáles son facturas (ej: DA) vs abonos
    (AB) es clave para análisis de cartera bruta vs neta.
  • Estado_Cartera: discrimina documentos activos vs compensados.
  • Bucket_Mora: indicador clave de salud de cartera.
    - % Al día            : ", pct_al_dia_v, "%
    - % Mora crítica >90d : ", pct_mora_crit, "%
    - Diagnóstico: ", case_when(
      pct_al_dia_v > 80 ~ "Cartera SANA",
      pct_al_dia_v > 60 ~ "Cartera MODERADA — seguimiento necesario",
      TRUE               ~ "Cartera DETERIORADA — gestión urgente"
    ), "

─── VARIABLES FECHA ─────────────────────────────────────────────────
  • Document_Date cubre un rango amplio de años: revisar registros
    extremos que podrían ser errores de carga.
  • Inconsistencias: revisar registros donde net_due_date < document_date.
  • Clearing_date nula en cartera abierta es estructural, no un error.

─── RECOMENDACIONES PRIORITARIAS ────────────────────────────────────
  1. Recalcular Bucket_Mora directamente desde Arrears (ya hecho aquí).
  2. Separar montos negativos (notas crédito) para análisis de saldo neto.
  3. Crear flag de outliers en Amount y Arrears para modelos.
  4. Validar registros donde Document_Date > Net_due_date (error lógico).
  5. Investigar registros Cerrados sin Clearing_date (", n_cerrada_sin_clearing, " casos).
  6. Evaluar concentración de cartera: aplicar regla 80/20 por cliente.
  7. Revisar Terms_of_Payment vacíos por tipo de documento.

══════════════════════════════════════════════════════════════════════
")

cat(conclusion)

## 
## ╔══════════════════════════════════════════════════════════════════╗
## ║         CONCLUSIÓN GENERAL DEL ANÁLISIS EXPLORATORIO            ║
## ╚══════════════════════════════════════════════════════════════════╝
## 
## Generado automáticamente el: 10/04/2026 03:23
## 
## ─── ESTRUCTURA DE LA BASE ───────────────────────────────────────────
##   • La base contiene 341,027 registros y 14 variables (se eliminó Pago_Oportuno_Bin).
##   • Representa 319 clientes únicos y 10,000 documentos únicos.
##   • Se identificaron variables de 4 tipos:
##     - 2 Identificadores: Anon_Customer_ID, Anon_Document_ID
##     - 4 Fechas         : Document_Date, Payment_date, Net_due_date, Clearing_date
##     - 2 Numéricas      : Arrears_after_net_due_date, Amount_in_local_currency
##     - 6 Categóricas    : Terms_of_Payment, Document_Type, Reason_code,
##                          Year/month, Estado_Cartera, Bucket_Mora
## 
## ─── CALIDAD DE DATOS ────────────────────────────────────────────────
##   • Faltantes críticos:
##     - clearing_date : 3.5% faltante — NORMAL en cartera abierta (sin compensar)
##     - payment_date  : 0% faltante — revisar si cartera cerrada sin fecha de pago
##   • Terms_of_Payment tiene valores vacíos ('') en parte de los registros.
##     Posiblemente corresponde a abonos (AB) o documentos sin condición.
##   • Bucket_Mora y Pago_Oportuno_Bin venían como fórmulas IF sin calcular
##     en Excel. Se reconstruyeron desde Arrears en este script.
##   • Duplicados por fila completa : 3405
##   • Document IDs con duplicados  : 10000
## 
## ─── VARIABLES NUMÉRICAS ─────────────────────────────────────────────
##   • Arrears (días de atraso):
##     - Media 15.9 días | Mediana 6 días
##     - Valores negativos = pagos anticipados (comportamiento válido)
##     - Outliers IQR: 34,206 registros (10%)
##     - Distribución fuertemente sesgada a la derecha: muchos registros
##       en mora muy alta (>360 días)
##   • Amount (monto):
##     - Montos negativos = notas crédito / abonos contables (válidos)
##     - Outliers IQR: 83,272 registros (24.4%)
##     - Alta variabilidad entre clientes: revisar concentración
## 
## ─── VARIABLES CATEGÓRICAS ───────────────────────────────────────────
##   • Document_Type: identificar cuáles son facturas (ej: DA) vs abonos
##     (AB) es clave para análisis de cartera bruta vs neta.
##   • Estado_Cartera: discrimina documentos activos vs compensados.
##   • Bucket_Mora: indicador clave de salud de cartera.
##     - % Al día            : 32.59%
##     - % Mora crítica >90d : 4.17%
##     - Diagnóstico: Cartera DETERIORADA — gestión urgente
## 
## ─── VARIABLES FECHA ─────────────────────────────────────────────────
##   • Document_Date cubre un rango amplio de años: revisar registros
##     extremos que podrían ser errores de carga.
##   • Inconsistencias: revisar registros donde net_due_date < document_date.
##   • Clearing_date nula en cartera abierta es estructural, no un error.
## 
## ─── RECOMENDACIONES PRIORITARIAS ────────────────────────────────────
##   1. Recalcular Bucket_Mora directamente desde Arrears (ya hecho aquí).
##   2. Separar montos negativos (notas crédito) para análisis de saldo neto.
##   3. Crear flag de outliers en Amount y Arrears para modelos.
##   4. Validar registros donde Document_Date > Net_due_date (error lógico).
##   5. Investigar registros Cerrados sin Clearing_date (5987 casos).
##   6. Evaluar concentración de cartera: aplicar regla 80/20 por cliente.
##   7. Revisar Terms_of_Payment vacíos por tipo de documento.
## 
## ══════════════════════════════════════════════════════════════════════

# ═══════════════════════════════════════════════════════════════════════════════
# FIN DEL SCRIPT
# ═══════════════════════════════════════════════════════════════════════════════
cat("\n✅  ANÁLISIS COMPLETADO\n")

## 
## ✅  ANÁLISIS COMPLETADO

cat("    Revisa los gráficos generados en el panel de RStudio (Plots).\n")

##     Revisa los gráficos generados en el panel de RStudio (Plots).

cat("    Para exportar gráficos, usa ggsave() o el botón Export en Plots.\n\n")

##     Para exportar gráficos, usa ggsave() o el botón Export en Plots.

# ═══════════════════════════════════════════════════════════════════════════════
#  EDA AVANZADO — GRÁFICOS COMPLEMENTARIOS  |  BASE I2C / CARTERA
#  Complementa: EDA_I2C_Variable_por_Variable.R
#  Secciones:
#    1. Análisis bivariado
#    2. Análisis temporal
#    3. Correlación (heatmap)
#    4. Distribuciones avanzadas
#    5. Calidad de datos (nulos)
#    6. Segmentación por cliente
# ═══════════════════════════════════════════════════════════════════════════════
#
#  REQUISITO: ejecutar primero EDA_I2C_Variable_por_Variable.R
#  para tener el dataframe `df` en el entorno de R.
#  Si no lo tienes cargado, ejecuta el bloque de carga al final
#  de este script (marcado con "CARGA INDEPENDIENTE").
# ═══════════════════════════════════════════════════════════════════════════════


# ───────────────────────────────────────────────────────────────────────────────
# 0 ▸ LIBRERÍAS Y CONFIGURACIÓN VISUAL
# ───────────────────────────────────────────────────────────────────────────────

paquetes <- c("tidyverse", "readxl", "janitor", "ggplot2", "patchwork",
              "scales", "ggcorrplot", "moments", "lubridate")

for (pkg in paquetes) {
  if (!requireNamespace(pkg, quietly = TRUE))
    install.packages(pkg, repos = "https://cran.r-project.org")
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}

## Warning: package 'ggcorrplot' was built under R version 4.5.3

# ── Paleta y tema (idénticos al script principal) ─────────────────────────────
C1 <- "#2C3E7A"; C2 <- "#E84040"; C3 <- "#F5A623"; C4 <- "#27AE60"

PALETA_BUCKET <- c(
  "Al dia"   = C4,
  "1-30"     = "#85C17E",
  "31-60"    = C3,
  "61-90"    = "#E8A323",
  "91-180"   = "#D4691E",
  "181-360"  = C2,
  ">360"     = "#7B0000"
)

ORDEN_BUCKET <- c("Al dia","1-30","31-60","61-90","91-180","181-360",">360")

tema <- theme_minimal(base_size = 11) +
  theme(
    plot.title      = element_text(face = "bold", size = 13, color = C1),
    plot.subtitle   = element_text(size = 10, color = "gray40"),
    plot.caption    = element_text(size = 8,  color = "gray55"),
    axis.title      = element_text(size = 10),
    legend.position = "bottom",
    panel.grid.minor = element_blank()
  )

separador <- function(titulo) {
  cat("\n", strrep("═", 65), "\n", sep = "")
  cat("  ", toupper(titulo), "\n", sep = "")
  cat(strrep("═", 65), "\n\n", sep = "")
}

cat("✓ Configuración lista\n\n")

## ✓ Configuración lista

# ═══════════════════════════════════════════════════════════════════════════════
#  ▸ CARGA INDEPENDIENTE  (omitir si df ya está en el entorno)
# ═══════════════════════════════════════════════════════════════════════════════
# Descomenta este bloque si ejecutas este script de forma independiente:

# RUTA <- "C:/Users/jcabia01/Downloads/Tabla anonimización Ok.xlsx"
# df_raw <- read_excel(RUTA, sheet = 1, guess_max = 10000, col_types = "text")
# col_elim <- grep("pago|oportuno|Bin", names(df_raw), value=TRUE, ignore.case=TRUE)
# if (length(col_elim) > 0) df_raw <- df_raw %>% select(-all_of(col_elim))
# df <- df_raw %>%
#   clean_names() %>%
#   mutate(
#     anon_customer_id = as.character(anon_customer_id),
#     anon_document_id = as.character(anon_document_id),
#     reason_code      = as.character(reason_code),
#     terms_of_payment = as.character(terms_of_payment),
#     document_type    = as.character(document_type),
#     year_month       = as.character(year_month),
#     estado_cartera   = as.character(estado_cartera_abierta_cerrada),
#     arrears          = suppressWarnings(as.numeric(arrears_after_net_due_date)),
#     amount           = suppressWarnings(as.numeric(amount_in_local_currency)),
#     document_date    = suppressWarnings(as.Date(as.numeric(document_date),  origin="1899-12-30")),
#     payment_date     = suppressWarnings(as.Date(as.numeric(payment_date),   origin="1899-12-30")),
#     net_due_date     = suppressWarnings(as.Date(as.numeric(net_due_date),   origin="1899-12-30")),
#     clearing_date    = suppressWarnings(as.Date(as.numeric(clearing_date),  origin="1899-12-30")),
#     bucket_mora = factor(
#       case_when(
#         arrears <= 0   ~ "Al dia",  arrears <= 30  ~ "1-30",
#         arrears <= 60  ~ "31-60",   arrears <= 90  ~ "61-90",
#         arrears <= 180 ~ "91-180",  arrears <= 360 ~ "181-360",
#         TRUE           ~ ">360"),
#       levels = c("Al dia","1-30","31-60","61-90","91-180","181-360",">360"),
#       ordered = TRUE)
#   ) %>%
#   select(anon_customer_id, anon_document_id, terms_of_payment, document_type,
#          document_date, payment_date, net_due_date, arrears, amount,
#          reason_code, clearing_date, year_month, estado_cartera, bucket_mora)
# cat("✓ Datos cargados:", nrow(df), "filas x", ncol(df), "columnas\n")

# ── Verificar que df existe ────────────────────────────────────────────────────
if (!exists("df")) stop("⚠ El objeto 'df' no existe. Carga primero el script principal o descomenta el bloque CARGA INDEPENDIENTE.")

# ── Preparar datos de trabajo ─────────────────────────────────────────────────
# Datos sin outliers extremos de amount (para scatter/density legibles)
lim_amt <- list(
  inf = quantile(df$amount, .01, na.rm = TRUE),
  sup = quantile(df$amount, .99, na.rm = TRUE)
)
lim_arr <- list(
  inf = quantile(df$arrears, .01, na.rm = TRUE),
  sup = quantile(df$arrears, .99, na.rm = TRUE)
)

df_clean <- df %>%
  filter(!is.na(amount), !is.na(arrears),
         amount  >= lim_amt$inf, amount  <= lim_amt$sup,
         arrears >= lim_arr$inf, arrears <= lim_arr$sup)

df_bkt <- df %>%
  filter(!is.na(bucket_mora)) %>%
  mutate(bucket_mora = factor(bucket_mora, levels = ORDEN_BUCKET, ordered = TRUE))

cat("Registros totales           :", formatC(nrow(df), big.mark = ","), "\n")

## Registros totales           : 341,027

cat("Registros sin outliers (p1-p99):", formatC(nrow(df_clean), big.mark = ","), "\n\n")

## Registros sin outliers (p1-p99): 327,619

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 1 ▸ ANÁLISIS BIVARIADO
# ═══════════════════════════════════════════════════════════════════════════════

separador("SECCIÓN 1 — ANÁLISIS BIVARIADO")

## 
## ═════════════════════════════════════════════════════════════════
##   SECCIÓN 1 — ANÁLISIS BIVARIADO
## ═════════════════════════════════════════════════════════════════

# ── 1.1 Boxplot: Amount vs Bucket_Mora ────────────────────────────────────────
cat("  Generando 1.1: Boxplot Amount vs Bucket_Mora...\n")

##   Generando 1.1: Boxplot Amount vs Bucket_Mora...

# Estadísticas medianas para etiquetar
medians_amt <- df_bkt %>%
  filter(!is.na(amount)) %>%
  group_by(bucket_mora) %>%
  summarise(med = median(amount, na.rm = TRUE), .groups = "drop")

p1_1 <- df_bkt %>%
  filter(!is.na(amount),
         amount >= lim_amt$inf, amount <= lim_amt$sup) %>%
  ggplot(aes(x = bucket_mora, y = amount, fill = bucket_mora)) +
  geom_boxplot(alpha = 0.75, outlier.alpha = 0.08,
               outlier.size = 0.6, show.legend = FALSE) +
  geom_text(data = medians_amt,
            aes(x = bucket_mora, y = med,
                label = paste0("Med:\n", comma(round(med, 0)))),
            inherit.aes = FALSE,
            size = 2.8, vjust = -0.5, color = "gray20", fontface = "bold") +
  scale_fill_manual(values = PALETA_BUCKET) +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Monto por Tramo de Mora",
    subtitle = "Distribución de Amount_in_local_currency según Bucket_Mora (p1-p99)",
    x        = "Tramo de mora (Bucket)",
    y        = "Monto en moneda local",
    caption  = "Sin outliers extremos (p1-p99)"
  ) +
  tema
print(p1_1)

# ── 1.2 Boxplot: Arrears vs Bucket_Mora ───────────────────────────────────────
cat("  Generando 1.2: Boxplot Arrears vs Bucket_Mora...\n")

##   Generando 1.2: Boxplot Arrears vs Bucket_Mora...

medians_arr <- df_bkt %>%
  filter(!is.na(arrears)) %>%
  group_by(bucket_mora) %>%
  summarise(med = median(arrears, na.rm = TRUE), .groups = "drop")

p1_2 <- df_bkt %>%
  filter(!is.na(arrears),
         arrears >= lim_arr$inf, arrears <= lim_arr$sup) %>%
  ggplot(aes(x = bucket_mora, y = arrears, fill = bucket_mora)) +
  geom_boxplot(alpha = 0.75, outlier.alpha = 0.08,
               outlier.size = 0.6, show.legend = FALSE) +
  geom_text(data = medians_arr,
            aes(x = bucket_mora, y = med,
                label = paste0("Med: ", round(med, 0), "d")),
            inherit.aes = FALSE,
            size = 2.8, vjust = -0.6, color = "gray20", fontface = "bold") +
  scale_fill_manual(values = PALETA_BUCKET) +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Días de Atraso por Tramo de Mora",
    subtitle = "Arrears_after_net_due_date según Bucket_Mora (p1-p99)",
    x        = "Tramo de mora (Bucket)",
    y        = "Días de atraso",
    caption  = "Validación: cada bucket debe contener el rango de días que representa"
  ) +
  tema
print(p1_2)

# ── 1.3 Boxplot: Arrears vs Document_Type ─────────────────────────────────────
cat("  Generando 1.3: Boxplot Arrears vs Document_Type...\n")

##   Generando 1.3: Boxplot Arrears vs Document_Type...

# Ordenar tipos por mediana descendente, mostrar solo top 10
orden_dt <- df %>%
  filter(!is.na(arrears), !is.na(document_type)) %>%
  group_by(document_type) %>%
  summarise(med = median(arrears, na.rm = TRUE),
            n   = n(), .groups = "drop") %>%
  arrange(desc(med)) %>%
  slice_head(n = 10) %>%
  pull(document_type)

p1_3 <- df %>%
  filter(!is.na(arrears), document_type %in% orden_dt,
         arrears >= lim_arr$inf, arrears <= lim_arr$sup) %>%
  mutate(document_type = factor(document_type, levels = orden_dt)) %>%
  ggplot(aes(x = document_type, y = arrears, fill = document_type)) +
  geom_boxplot(alpha = 0.75, outlier.alpha = 0.1,
               outlier.size = 0.6, show.legend = FALSE) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Días de Atraso por Tipo de Documento",
    subtitle = "Top 10 tipos de documento, ordenados por mediana de arrears",
    x        = "Tipo de documento",
    y        = "Días de atraso",
    caption  = "p1-p99 para legibilidad"
  ) +
  tema +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
print(p1_3)

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

# ── 1.4 Barras apiladas: Bucket_Mora vs Estado_Cartera ────────────────────────
cat("  Generando 1.4: Barras apiladas Bucket_Mora vs Estado_Cartera...\n")

##   Generando 1.4: Barras apiladas Bucket_Mora vs Estado_Cartera...

tab_bkt_estado <- df_bkt %>%
  filter(!is.na(estado_cartera)) %>%
  count(bucket_mora, estado_cartera) %>%
  group_by(bucket_mora) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  ungroup()

# Panel A: frecuencias absolutas apiladas
p1_4a <- ggplot(tab_bkt_estado,
                aes(x = bucket_mora, y = n, fill = estado_cartera)) +
  geom_col(alpha = 0.9, position = "stack") +
  scale_fill_manual(values = c("Abierta" = C2, "Cerrada" = C4),
                    name = "Estado cartera") +
  scale_y_continuous(labels = comma) +
  labs(title    = "Bucket_Mora × Estado_Cartera (absoluto)",
       x = "Tramo de mora", y = "Registros") +
  tema +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Panel B: distribución porcentual (100% apilado)
p1_4b <- ggplot(tab_bkt_estado,
                aes(x = bucket_mora, y = pct, fill = estado_cartera)) +
  geom_col(alpha = 0.9, position = "fill") +
  geom_text(aes(label = ifelse(pct > 5, paste0(pct, "%"), "")),
            position = position_fill(vjust = 0.5),
            size = 3, color = "white", fontface = "bold") +
  scale_fill_manual(values = c("Abierta" = C2, "Cerrada" = C4),
                    name = "Estado cartera") +
  scale_y_continuous(labels = percent) +
  labs(title    = "Bucket_Mora × Estado_Cartera (100% apilado)",
       subtitle = "Proporción de cartera abierta y cerrada por tramo",
       x = "Tramo de mora", y = "Proporción") +
  tema +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

panel_1_4 <- p1_4a / p1_4b +
  plot_annotation(
    title = "Relación: Tramo de Mora vs Estado de Cartera",
    theme = theme(plot.title = element_text(face = "bold", size = 14, color = C1))
  )
print(panel_1_4)

# ── 1.5 Scatter: Amount vs Arrears (muestra aleatoria para rendimiento) ────────
cat("  Generando 1.5: Scatter Amount vs Arrears...\n")

##   Generando 1.5: Scatter Amount vs Arrears...

# Con 341k filas un scatter completo es ilegible → muestra estratificada
set.seed(42)
df_sample <- df_clean %>%
  filter(!is.na(bucket_mora)) %>%
  group_by(bucket_mora) %>%
  slice_sample(prop = 1) %>%
  slice_head(n = 500) %>%              # hasta 500 por bucket
  ungroup()

cat("  Muestra para scatter:", nrow(df_sample), "puntos\n")

##   Muestra para scatter: 3000 puntos

# 1.5a Sin color por bucket
p1_5a <- ggplot(df_sample, aes(x = arrears, y = amount)) +
  geom_point(alpha = 0.25, size = 1.2, color = C1) +
  geom_smooth(method = "lm", color = C2, linewidth = 1,
              se = TRUE, linetype = "dashed") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Amount vs Arrears — Scatter general",
    subtitle = paste0("Muestra estratificada: ", nrow(df_sample),
                      " puntos (p1-p99)"),
    x        = "Días de atraso (Arrears)",
    y        = "Monto (Amount)",
    caption  = "Línea roja = tendencia lineal (OLS)"
  ) +
  tema

# 1.5b Coloreado por Bucket_Mora
p1_5b <- ggplot(df_sample,
                aes(x = arrears, y = amount, color = bucket_mora)) +
  geom_point(alpha = 0.35, size = 1.3) +
  scale_color_manual(values = PALETA_BUCKET, name = "Bucket mora") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  labs(
    title    = "Amount vs Arrears — coloreado por Bucket_Mora",
    subtitle = "Permite ver si los tramos de mora se distribuyen de forma distinta por monto",
    x        = "Días de atraso (Arrears)",
    y        = "Monto (Amount)"
  ) +
  tema

panel_scatter <- p1_5a / p1_5b +
  plot_annotation(
    title = "Scatter: Monto vs Días de Atraso",
    theme = theme(plot.title = element_text(face = "bold", size = 14, color = C1))
  )
print(panel_scatter)

## `geom_smooth()` using formula = 'y ~ x'

# ── Tabla de correlación Amount-Arrears por bucket (complemento del scatter) ──
tab_cor_bucket <- df_clean %>%
  filter(!is.na(bucket_mora)) %>%
  group_by(bucket_mora) %>%
  summarise(
    n   = n(),
    cor = round(cor(amount, arrears, use = "pairwise.complete.obs"), 3),
    .groups = "drop"
  )
cat("\n  Correlación Amount vs Arrears por Bucket_Mora:\n")

## 
##   Correlación Amount vs Arrears por Bucket_Mora:

print(tab_cor_bucket)

## # A tibble: 6 × 3
##   bucket_mora      n    cor
##   <ord>        <int>  <dbl>
## 1 Al dia      105111 -0.269
## 2 1-30        170224  0.105
## 3 31-60        31864 -0.033
## 4 61-90         9593 -0.01 
## 5 91-180        9858  0.001
## 6 181-360        969  0.014

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 2 ▸ ANÁLISIS TEMPORAL  (Year/month)
# ═══════════════════════════════════════════════════════════════════════════════

separador("SECCIÓN 2 — ANÁLISIS TEMPORAL")

## 
## ═════════════════════════════════════════════════════════════════
##   SECCIÓN 2 — ANÁLISIS TEMPORAL
## ═════════════════════════════════════════════════════════════════

# Preparar: convertir year_month "YYYY/MM" a fecha real para ordenar bien
df_temp <- df %>%
  filter(!is.na(year_month), year_month != "",
         !is.na(amount), !is.na(arrears)) %>%
  mutate(
    periodo = ym(str_replace(year_month, "/", "-")),  # "2023/03" → 2023-03-01
  ) %>%
  filter(!is.na(periodo)) %>%
  group_by(periodo, year_month) %>%
  summarise(
    monto_total     = sum(amount,  na.rm = TRUE),
    mora_promedio   = mean(arrears, na.rm = TRUE),
    n_documentos    = n(),
    .groups = "drop"
  ) %>%
  arrange(periodo)

cat("  Períodos con datos:", nrow(df_temp), "\n")

##   Períodos con datos: 51

cat("  Desde:", as.character(min(df_temp$periodo, na.rm=TRUE)),
    "hasta:", as.character(max(df_temp$periodo, na.rm=TRUE)), "\n\n")

##   Desde: 2022-01-01 hasta: 2026-03-01

# ── 2.1 Línea: Monto total por mes ────────────────────────────────────────────
cat("  Generando 2.1: Monto total por mes...\n")

##   Generando 2.1: Monto total por mes...

p2_1 <- ggplot(df_temp, aes(x = periodo, y = monto_total)) +
  geom_area(fill = C1, alpha = 0.15) +
  geom_line(color = C1, linewidth = 1) +
  geom_point(color = C1, size = 1.8, alpha = 0.8) +
  # Marcar máximo y mínimo
  geom_point(data = df_temp %>% filter(monto_total == max(monto_total)),
             aes(x = periodo, y = monto_total),
             color = C4, size = 3.5, shape = 17, inherit.aes = FALSE) +
  geom_point(data = df_temp %>% filter(monto_total == min(monto_total)),
             aes(x = periodo, y = monto_total),
             color = C2, size = 3.5, shape = 25, fill = C2,
             inherit.aes = FALSE) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b\n%Y") +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Monto Total por Período (Year/Month)",
    subtitle = "▲ verde = máximo  |  ▼ rojo = mínimo",
    x        = NULL,
    y        = "Monto total (moneda local)"
  ) +
  tema +
  theme(axis.text.x = element_text(size = 8))
print(p2_1)

# ── 2.2 Línea: Mora promedio por mes ──────────────────────────────────────────
cat("  Generando 2.2: Mora promedio por mes...\n")

##   Generando 2.2: Mora promedio por mes...

# Media global para referencia
mora_global <- mean(df$arrears, na.rm = TRUE)

p2_2 <- ggplot(df_temp, aes(x = periodo, y = mora_promedio)) +
  geom_hline(yintercept = mora_global, linetype = "dashed",
             color = "gray60", linewidth = 0.8) +
  annotate("text", x = min(df_temp$periodo), y = mora_global,
           label = paste0("Media global: ", round(mora_global, 1), "d"),
           hjust = 0, vjust = -0.5, size = 3, color = "gray50") +
  geom_area(fill = C2, alpha = 0.12) +
  geom_line(color = C2, linewidth = 1) +
  geom_point(color = C2, size = 1.8, alpha = 0.8) +
  scale_x_date(date_breaks = "3 months", date_labels = "%b\n%Y") +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Mora Promedio por Período (Year/Month)",
    subtitle = "Línea gris = promedio global de Arrears_after_net_due_date",
    x        = NULL,
    y        = "Días de atraso promedio"
  ) +
  tema +
  theme(axis.text.x = element_text(size = 8))
print(p2_2)

# ── 2.3 Barras: Número de documentos por mes ──────────────────────────────────
cat("  Generando 2.3: Documentos por mes...\n")

##   Generando 2.3: Documentos por mes...

p2_3 <- ggplot(df_temp, aes(x = periodo, y = n_documentos)) +
  geom_col(fill = C1, alpha = 0.85, width = 20) +
  geom_text(aes(label = comma(n_documentos)),
            vjust = -0.4, size = 2.5, color = "gray30") +
  scale_x_date(date_breaks = "3 months", date_labels = "%b\n%Y") +
  scale_y_continuous(labels = comma,
                     expand = expansion(mult = c(0, 0.12))) +
  labs(
    title    = "Número de Documentos por Período (Year/Month)",
    subtitle = "Cantidad de registros ingresados cada mes",
    x        = NULL,
    y        = "N° de documentos"
  ) +
  tema +
  theme(axis.text.x = element_text(size = 8))
print(p2_3)

# ── Panel temporal unificado ───────────────────────────────────────────────────
panel_temporal <- p2_1 / p2_2 / p2_3 +
  plot_annotation(
    title   = "Panel Temporal — Evolución de la Cartera por Mes",
    caption = paste("Períodos:", nrow(df_temp),
                    " | Desde:", format(min(df_temp$periodo), "%b %Y"),
                    "hasta:", format(max(df_temp$periodo), "%b %Y")),
    theme   = theme(
      plot.title   = element_text(face = "bold", size = 14, color = C1),
      plot.caption = element_text(size = 8, color = "gray50")
    )
  )
print(panel_temporal)

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 3 ▸ CORRELACIÓN — HEATMAP
# ═══════════════════════════════════════════════════════════════════════════════

separador("SECCIÓN 3 — CORRELACIÓN (HEATMAP)")

## 
## ═════════════════════════════════════════════════════════════════
##   SECCIÓN 3 — CORRELACIÓN (HEATMAP)
## ═════════════════════════════════════════════════════════════════

cat("  Generando matriz de correlación...\n")

##   Generando matriz de correlación...

# Variables numéricas disponibles + derivadas útiles
df_num <- df %>%
  mutate(
    bucket_num   = as.numeric(bucket_mora),     # ordinal → numérico
    estado_num   = if_else(estado_cartera == "Abierta", 1L, 0L, NA_integer_),
    doc_type_num = as.numeric(factor(document_type))
  ) %>%
  select(
    `Arrears`      = arrears,
    `Amount`       = amount,
    `Bucket (ord)` = bucket_num,
    `Estado (bin)` = estado_num,
    `Doc Type`     = doc_type_num
  ) %>%
  drop_na()

mat_cor <- cor(df_num, use = "pairwise.complete.obs", method = "pearson")
cat("\n  Matriz de correlación de Pearson:\n")

## 
##   Matriz de correlación de Pearson:

print(round(mat_cor, 3))

##              Arrears Amount Bucket (ord) Estado (bin) Doc Type
## Arrears        1.000 -0.002        0.287        0.005   -0.062
## Amount        -0.002  1.000       -0.010        0.017    0.074
## Bucket (ord)   0.287 -0.010        1.000       -0.009   -0.134
## Estado (bin)   0.005  0.017       -0.009        1.000    0.047
## Doc Type      -0.062  0.074       -0.134        0.047    1.000

# Heatmap con ggcorrplot
p3_heat <- ggcorrplot(
  mat_cor,
  method    = "square",
  type      = "lower",
  lab       = TRUE,
  lab_size  = 4.5,
  colors    = c(C2, "white", C1),
  outline.color = "white",
  tl.cex    = 10,
  title     = "Matriz de Correlación — Variables Numéricas y Derivadas",
  ggtheme   = theme_minimal(base_size = 11)
) +
  theme(
    plot.title = element_text(face = "bold", size = 13, color = C1),
    legend.title = element_text(size = 9)
  )

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(p3_heat)

# ── Interpretación automática de correlaciones fuertes ────────────────────────
cat("\n  Correlaciones con |r| > 0.3:\n")

## 
##   Correlaciones con |r| > 0.3:

cor_long <- as.data.frame(as.table(mat_cor)) %>%
  rename(Var1 = Var1, Var2 = Var2, Correlacion = Freq) %>%
  filter(as.character(Var1) < as.character(Var2),
         abs(Correlacion) > 0.3) %>%
  mutate(Correlacion = round(Correlacion, 3)) %>%
  arrange(desc(abs(Correlacion)))

if (nrow(cor_long) > 0) {
  print(cor_long)
} else {
  cat("  No se encontraron correlaciones con |r| > 0.3\n")
}

##   No se encontraron correlaciones con |r| > 0.3

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 4 ▸ DISTRIBUCIONES AVANZADAS
# ═══════════════════════════════════════════════════════════════════════════════

separador("SECCIÓN 4 — DISTRIBUCIONES AVANZADAS")

## 
## ═════════════════════════════════════════════════════════════════
##   SECCIÓN 4 — DISTRIBUCIONES AVANZADAS
## ═════════════════════════════════════════════════════════════════

# ── 4.1 Density plot global de Amount ─────────────────────────────────────────
cat("  Generando 4.1: Density plot Amount (global)...\n")

##   Generando 4.1: Density plot Amount (global)...

media_amt <- mean(df_clean$amount, na.rm = TRUE)
med_amt   <- median(df_clean$amount, na.rm = TRUE)

p4_1 <- ggplot(df_clean, aes(x = amount)) +
  geom_density(fill = C1, color = C1, alpha = 0.40, linewidth = 1) +
  geom_vline(xintercept = media_amt, color = C2,
             linetype = "dashed", linewidth = 1) +
  geom_vline(xintercept = med_amt,  color = C3,
             linetype = "dashed", linewidth = 1) +
  annotate("text", x = media_amt, y = 0,
           label = paste0("Media\n", comma(round(media_amt, 0))),
           hjust = -0.1, vjust = 0, color = C2, size = 3.2, fontface = "bold") +
  annotate("text", x = med_amt, y = 0,
           label = paste0("Mediana\n", comma(round(med_amt, 0))),
           hjust =  1.1, vjust = 0, color = C3, size = 3.2, fontface = "bold") +
  scale_x_continuous(labels = comma) +
  labs(
    title    = "Densidad de Amount_in_local_currency",
    subtitle = "Distribución global (p1-p99) | Rojo = Media | Naranja = Mediana",
    x        = "Monto (moneda local)",
    y        = "Densidad"
  ) +
  tema
print(p4_1)

# ── 4.2 Density plot de Amount segmentado por Bucket_Mora ────────────────────
cat("  Generando 4.2: Density plot Amount por Bucket_Mora...\n")

##   Generando 4.2: Density plot Amount por Bucket_Mora...

p4_2 <- df_clean %>%
  filter(!is.na(bucket_mora)) %>%
  mutate(bucket_mora = factor(bucket_mora, levels = ORDEN_BUCKET)) %>%
  ggplot(aes(x = amount, fill = bucket_mora, color = bucket_mora)) +
  geom_density(alpha = 0.30, linewidth = 0.8) +
  scale_fill_manual(values  = PALETA_BUCKET, name = "Bucket mora") +
  scale_color_manual(values = PALETA_BUCKET, name = "Bucket mora") +
  scale_x_continuous(labels = comma) +
  guides(
    fill  = guide_legend(nrow = 1, override.aes = list(alpha = 0.7)),
    color = guide_legend(nrow = 1)
  ) +
  labs(
    title    = "Densidad de Amount por Tramo de Mora",
    subtitle = "Cada curva representa la distribución de montos en ese bucket (p1-p99)",
    x        = "Monto (moneda local)",
    y        = "Densidad",
    caption  = "Solapamiento de curvas revela si los tramos de mora tienen montos similares"
  ) +
  tema
print(p4_2)

# ── 4.2b Facetas: un panel por bucket ─────────────────────────────────────────
cat("  Generando 4.2b: Density facetado por Bucket_Mora...\n")

##   Generando 4.2b: Density facetado por Bucket_Mora...

p4_2b <- df_clean %>%
  filter(!is.na(bucket_mora)) %>%
  mutate(bucket_mora = factor(bucket_mora, levels = ORDEN_BUCKET)) %>%
  ggplot(aes(x = amount, fill = bucket_mora)) +
  geom_histogram(bins = 40, alpha = 0.85, color = "white") +
  facet_wrap(~ bucket_mora, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = PALETA_BUCKET, guide = "none") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Distribución de Amount por Bucket_Mora (facetas)",
    subtitle = "Eje Y libre para comparar formas de distribución dentro de cada tramo",
    x        = "Monto (p1-p99)",
    y        = "Frecuencia"
  ) +
  tema +
  theme(strip.text = element_text(face = "bold", color = C1))
print(p4_2b)

# ── 4.3 Violin: Arrears por Bucket (complemento elegante del boxplot) ─────────
cat("  Generando 4.3: Violin Arrears por Bucket_Mora...\n")

##   Generando 4.3: Violin Arrears por Bucket_Mora...

p4_3 <- df_bkt %>%
  filter(!is.na(arrears),
         arrears >= lim_arr$inf, arrears <= lim_arr$sup) %>%
  ggplot(aes(x = bucket_mora, y = arrears, fill = bucket_mora)) +
  geom_violin(alpha = 0.65, trim = TRUE, scale = "width",
              show.legend = FALSE) +
  geom_boxplot(width = 0.12, fill = "white", alpha = 0.8,
               outlier.shape = NA, show.legend = FALSE) +
  scale_fill_manual(values = PALETA_BUCKET) +
  scale_y_continuous(labels = comma) +
  labs(
    title    = "Distribución de Arrears por Bucket_Mora (Violin + Boxplot)",
    subtitle = "El ancho del violín indica densidad de datos en ese rango de días",
    x        = "Tramo de mora",
    y        = "Días de atraso",
    caption  = "p1-p99 para legibilidad"
  ) +
  tema
print(p4_3)

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 5 ▸ CALIDAD DE DATOS — NULOS POR VARIABLE
# ═══════════════════════════════════════════════════════════════════════════════

separador("SECCIÓN 5 — CALIDAD DE DATOS (VALORES NULOS)")

## 
## ═════════════════════════════════════════════════════════════════
##   SECCIÓN 5 — CALIDAD DE DATOS (VALORES NULOS)
## ═════════════════════════════════════════════════════════════════

cat("  Generando gráfico de nulos por variable...\n")

##   Generando gráfico de nulos por variable...

tab_nulos <- df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(),
               names_to  = "Variable",
               values_to = "N_Nulos") %>%
  mutate(
    Pct_Nulos  = round(N_Nulos / nrow(df) * 100, 2),
    N_Completos = nrow(df) - N_Nulos,
    Severidad  = case_when(
      Pct_Nulos == 0   ~ "Sin nulos",
      Pct_Nulos <=  5  ~ "Bajo (<5%)",
      Pct_Nulos <= 20  ~ "Moderado (5-20%)",
      Pct_Nulos <= 50  ~ "Alto (20-50%)",
      TRUE              ~ "Crítico (>50%)"
    ),
    Severidad = factor(Severidad,
                       levels = c("Sin nulos","Bajo (<5%)","Moderado (5-20%)",
                                  "Alto (20-50%)","Crítico (>50%)"))
  ) %>%
  arrange(desc(Pct_Nulos))

cat("\n  Tabla de nulos por variable:\n")

## 
##   Tabla de nulos por variable:

print(tab_nulos, n = Inf)

## # A tibble: 14 × 5
##    Variable         N_Nulos Pct_Nulos N_Completos Severidad       
##    <chr>              <int>     <dbl>       <int> <fct>           
##  1 reason_code       305226     89.5        35801 Crítico (>50%)  
##  2 terms_of_payment   58231     17.1       282796 Moderado (5-20%)
##  3 clearing_date      11974      3.51      329053 Bajo (<5%)      
##  4 anon_customer_id       0      0         341027 Sin nulos       
##  5 anon_document_id       0      0         341027 Sin nulos       
##  6 document_type          0      0         341027 Sin nulos       
##  7 document_date          0      0         341027 Sin nulos       
##  8 payment_date           0      0         341027 Sin nulos       
##  9 net_due_date           0      0         341027 Sin nulos       
## 10 arrears                0      0         341027 Sin nulos       
## 11 amount                 0      0         341027 Sin nulos       
## 12 year_month             0      0         341027 Sin nulos       
## 13 estado_cartera         0      0         341027 Sin nulos       
## 14 bucket_mora            0      0         341027 Sin nulos

col_severidad <- c(
  "Sin nulos"        = C4,
  "Bajo (<5%)"       = "#85C17E",
  "Moderado (5-20%)" = C3,
  "Alto (20-50%)"    = C2,
  "Crítico (>50%)"   = "#7B0000"
)

# ── 5.1 Barras: cantidad de nulos por variable ─────────────────────────────────
p5_1 <- ggplot(tab_nulos,
               aes(x = reorder(Variable, N_Nulos),
                   y = N_Nulos,
                   fill = Severidad)) +
  geom_col(alpha = 0.9) +
  geom_text(aes(label = ifelse(N_Nulos > 0,
                               paste0(comma(N_Nulos), "\n(", Pct_Nulos, "%)"),
                               "0")),
            hjust = -0.05, size = 3.2, color = "gray20") +
  coord_flip() +
  scale_fill_manual(values = col_severidad, name = "Severidad") +
  scale_y_continuous(labels = comma,
                     expand = expansion(mult = c(0, 0.25))) +
  labs(
    title    = "Valores Nulos por Variable",
    subtitle = paste0("Base: ", formatC(nrow(df), big.mark = ","),
                      " registros | ",
                      sum(tab_nulos$N_Nulos == 0), " variables sin nulos"),
    x        = NULL,
    y        = "Cantidad de nulos"
  ) +
  tema +
  theme(legend.position = "right")
print(p5_1)

# ── 5.2 Heatmap de completitud: variable × muestra de filas ───────────────────
cat("  Generando 5.2: Heatmap de completitud...\n")

##   Generando 5.2: Heatmap de completitud...

set.seed(99)
df_miss_sample <- df %>%
  slice_head(n = 300) %>%
  # Convertir todo a character antes de pivotar: evita el error
  # "Can't combine <character> and <date>" de pivot_longer
  mutate(across(everything(), as.character)) %>%
  mutate(fila = row_number()) %>%
  pivot_longer(-fila, names_to = "Variable", values_to = "Valor") %>%
  mutate(Es_NA = is.na(Valor) | Valor == "NA")

p5_2 <- ggplot(df_miss_sample, aes(x = Variable, y = fila, fill = Es_NA)) +
  geom_tile() +
  scale_fill_manual(values = c("FALSE" = C1, "TRUE" = C2),
                    labels = c("Completo", "Nulo"),
                    name   = "Estado") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    title    = "Heatmap de Completitud (muestra 300 filas)",
    subtitle = "Rojo = nulo  |  Azul = dato presente",
    x        = "Variable",
    y        = "Fila (muestra)"
  ) +
  tema +
  theme(
    axis.text.x  = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y  = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid   = element_blank()
  )
print(p5_2)

# ── 5.3 Resumen: completitud global en barra horizontal ───────────────────────
p5_3 <- tab_nulos %>%
  mutate(
    Completos_pct = 100 - Pct_Nulos,
    Variable = factor(Variable, levels = rev(tab_nulos$Variable))
  ) %>%
  pivot_longer(cols = c(Completos_pct, Pct_Nulos),
               names_to = "Tipo", values_to = "Pct") %>%
  mutate(Tipo = if_else(Tipo == "Completos_pct", "Completo", "Nulo")) %>%
  ggplot(aes(x = Variable, y = Pct, fill = Tipo)) +
  geom_col(alpha = 0.9) +
  geom_text(data = tab_nulos %>%
              filter(Pct_Nulos > 0) %>%
              mutate(Variable = factor(Variable, levels = rev(tab_nulos$Variable))),
            aes(x = Variable, y = 100 - Pct_Nulos / 2,
                label = paste0(Pct_Nulos, "%")),
            inherit.aes = FALSE,
            size = 3, color = "white", fontface = "bold") +
  scale_fill_manual(values = c("Completo" = C1, "Nulo" = C2), name = NULL) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  coord_flip() +
  labs(
    title    = "Completitud por Variable (100% apilado)",
    subtitle = "Azul = dato presente  |  Rojo = nulo",
    x        = NULL,
    y        = "% de registros"
  ) +
  tema
print(p5_3)

# ═══════════════════════════════════════════════════════════════════════════════
#  SECCIÓN 6 ▸ SEGMENTACIÓN POR CLIENTE
# ═══════════════════════════════════════════════════════════════════════════════

separador("SECCIÓN 6 — SEGMENTACIÓN POR CLIENTE")

## 
## ═════════════════════════════════════════════════════════════════
##   SECCIÓN 6 — SEGMENTACIÓN POR CLIENTE
## ═════════════════════════════════════════════════════════════════

N_TOP <- 10   # número de top clientes a mostrar

# ── 6.1 Top N clientes por monto total ────────────────────────────────────────
cat("  Generando 6.1: Top", N_TOP, "clientes por monto total...\n")

##   Generando 6.1: Top 10 clientes por monto total...

tab_top_cust <- df %>%
  filter(!is.na(amount)) %>%
  group_by(anon_customer_id) %>%
  summarise(
    monto_total  = sum(amount, na.rm = TRUE),
    n_docs       = n(),
    mora_prom    = round(mean(arrears, na.rm = TRUE), 1),
    pct_mora_crit = round(
      mean(arrears > 90, na.rm = TRUE) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(monto_total)) %>%
  slice_head(n = N_TOP) %>%
  mutate(
    pct_monto  = round(monto_total / sum(df$amount, na.rm = TRUE) * 100, 2),
    monto_lbl  = paste0("$", comma(round(monto_total / 1e6, 1)), "M")
  )

cat("\n  Top", N_TOP, "clientes por monto total:\n")

## 
##   Top 10 clientes por monto total:

print(tab_top_cust %>%
        select(anon_customer_id, monto_total, pct_monto,
               n_docs, mora_prom, pct_mora_crit))

## # A tibble: 10 × 6
##    anon_customer_id  monto_total pct_monto n_docs mora_prom pct_mora_crit
##    <chr>                   <dbl>     <dbl>  <int>     <dbl>         <dbl>
##  1 CUST_317         20676237893.     12.0   65778      26.1           6.2
##  2 CUST_218         19482750020.     11.3    1065       5             0.2
##  3 CUST_215         17363198631.     10.1    2093      36.4          13.2
##  4 CUST_263         10633703698.      6.16   6608      25.6           4.1
##  5 CUST_281          9744251100       5.64   3214       1.9           0.6
##  6 CUST_216          7224419079.      4.18  19360      14.2           3.9
##  7 CUST_17           5564459842.      3.22   5016      14.3           3.3
##  8 CUST_95           5492939964.      3.18  24522      25.9           8.1
##  9 CUST_141          4723907294.      2.74   5638      17             4.8
## 10 CUST_133          4568406870.      2.65    335      -3.9           0

p6_1 <- ggplot(tab_top_cust,
               aes(x = reorder(anon_customer_id, monto_total),
                   y = monto_total)) +
  geom_col(aes(fill = pct_mora_crit), alpha = 0.9) +
  geom_text(aes(label = paste0(monto_lbl, "\n(", pct_monto, "%)")),
            hjust = -0.05, size = 3.2, color = "gray20") +
  scale_fill_gradient(
    low  = C4, high = C2,
    name = "% mora crítica\n(arrears > 90d)"
  ) +
  scale_y_continuous(labels = comma,
                     expand = expansion(mult = c(0, 0.25))) +
  coord_flip() +
  labs(
    title    = paste0("Top ", N_TOP, " Clientes por Monto Total"),
    subtitle = "Color = % de documentos en mora crítica (>90 días)",
    x        = NULL,
    y        = "Monto total (moneda local)"
  ) +
  tema +
  theme(legend.position = "right")
print(p6_1)

# ── 6.2 Boxplot de mora por cliente (top N) ───────────────────────────────────
cat("  Generando 6.2: Boxplot de mora por cliente (Top", N_TOP, ")...\n")

##   Generando 6.2: Boxplot de mora por cliente (Top 10 )...

top_ids <- tab_top_cust$anon_customer_id

df_top_cust <- df %>%
  filter(anon_customer_id %in% top_ids, !is.na(arrears)) %>%
  mutate(anon_customer_id = factor(anon_customer_id,
                                   levels = rev(top_ids)))

# Verificar que hay suficientes datos por cliente
n_por_cliente <- df_top_cust %>%
  count(anon_customer_id) %>%
  filter(n >= 5)

if (nrow(n_por_cliente) >= 2) {

  df_top_box <- df_top_cust %>%
    filter(anon_customer_id %in% n_por_cliente$anon_customer_id,
           arrears >= lim_arr$inf, arrears <= lim_arr$sup)

  medianas_cli <- df_top_box %>%
    group_by(anon_customer_id) %>%
    summarise(med = median(arrears, na.rm = TRUE), .groups = "drop")

  p6_2 <- ggplot(df_top_box,
                 aes(x = anon_customer_id, y = arrears,
                     fill = anon_customer_id)) +
    geom_boxplot(alpha = 0.70, outlier.alpha = 0.15,
                 outlier.size = 0.7, show.legend = FALSE) +
    geom_text(data = medianas_cli,
              aes(x = anon_customer_id, y = med,
                  label = paste0(round(med, 0), "d")),
              inherit.aes = FALSE,
              hjust = -0.2, size = 3, color = "gray20", fontface = "bold") +
    geom_hline(yintercept = 0, linetype = "dashed",
               color = C4, linewidth = 0.8) +
    coord_flip() +
    scale_fill_manual(values = setNames(
      colorRampPalette(c(C1, "#8E44AD", C2))(nrow(n_por_cliente)),
      n_por_cliente$anon_customer_id
    )) +
    scale_y_continuous(labels = comma) +
    labs(
      title    = paste0("Distribución de Días de Atraso — Top ", N_TOP, " Clientes"),
      subtitle = "Línea verde = 0 días (pago puntual) | Etiqueta = mediana",
      x        = NULL,
      y        = "Días de atraso (Arrears) — p1-p99",
      caption  = paste0("Solo clientes con ≥5 registros | n = ",
                        nrow(df_top_box), " documentos")
    ) +
    tema
  print(p6_2)

} else {
  cat("  ⚠ Pocos clientes con ≥5 registros en el top", N_TOP,
      "— se omite boxplot por cliente\n")
}

# ── 6.3 Segmentación: bucket_mora por top cliente ─────────────────────────────
cat("  Generando 6.3: Bucket_Mora por top cliente...\n")

##   Generando 6.3: Bucket_Mora por top cliente...

tab_cust_bucket <- df %>%
  filter(anon_customer_id %in% top_ids, !is.na(bucket_mora)) %>%
  mutate(
    anon_customer_id = factor(anon_customer_id, levels = rev(top_ids)),
    bucket_mora      = factor(bucket_mora, levels = ORDEN_BUCKET)
  ) %>%
  count(anon_customer_id, bucket_mora) %>%
  group_by(anon_customer_id) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  ungroup()

p6_3 <- ggplot(tab_cust_bucket,
               aes(x = anon_customer_id, y = pct, fill = bucket_mora)) +
  geom_col(alpha = 0.9, position = "fill") +
  geom_text(aes(label = ifelse(pct > 6, paste0(pct, "%"), "")),
            position = position_fill(vjust = 0.5),
            size = 2.8, color = "white", fontface = "bold") +
  scale_fill_manual(values = PALETA_BUCKET, name = "Bucket mora") +
  scale_y_continuous(labels = percent) +
  coord_flip() +
  labs(
    title    = paste0("Perfil de Mora por Cliente — Top ", N_TOP),
    subtitle = "Distribución porcentual de Bucket_Mora dentro de cada cliente",
    x        = NULL,
    y        = "% de documentos"
  ) +
  tema
print(p6_3)

# ═══════════════════════════════════════════════════════════════════════════════
#  PANEL RESUMEN FINAL — todos los hallazgos clave en una sola vista
# ═══════════════════════════════════════════════════════════════════════════════

separador("PANEL RESUMEN EJECUTIVO")

## 
## ═════════════════════════════════════════════════════════════════
##   PANEL RESUMEN EJECUTIVO
## ═════════════════════════════════════════════════════════════════

cat("  Generando panel ejecutivo...\n")

##   Generando panel ejecutivo...

# Mini versiones de los gráficos más importantes
mini_tema <- tema +
  theme(plot.title = element_text(size = 10),
        plot.subtitle = element_blank(),
        axis.text = element_text(size = 7),
        axis.title = element_text(size = 8),
        legend.text = element_text(size = 7),
        legend.title = element_text(size = 7))

# A: Estado cartera
tab_e <- df %>% count(estado_cartera) %>%
  mutate(pct = round(n/sum(n)*100,1))

pA <- ggplot(tab_e, aes(x="", y=n, fill=estado_cartera)) +
  geom_col(width=1, color="white") +
  coord_polar("y") +
  geom_text(aes(label=paste0(pct,"%")),
            position=position_stack(vjust=0.5),
            color="white", fontface="bold", size=4) +
  scale_fill_manual(values=c("Abierta"=C2,"Cerrada"=C4), name=NULL) +
  labs(title="Estado Cartera") +
  theme_void(base_size=10) +
  theme(plot.title=element_text(face="bold",color=C1,size=11),
        legend.position="bottom")

# B: Bucket mora
tab_b <- df %>% count(bucket_mora) %>%
  mutate(pct=round(n/sum(n)*100,1),
         bucket_mora=factor(bucket_mora,levels=ORDEN_BUCKET))

pB <- ggplot(tab_b, aes(x=bucket_mora, y=n, fill=bucket_mora)) +
  geom_col(alpha=0.9, show.legend=FALSE) +
  geom_text(aes(label=paste0(pct,"%")), vjust=-0.3, size=3) +
  scale_fill_manual(values=PALETA_BUCKET) +
  scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.12))) +
  labs(title="Distribución Bucket Mora", x=NULL, y="Registros") +
  mini_tema +
  theme(axis.text.x=element_text(angle=30,hjust=1))

# C: Monto total por mes (mini)
pC <- ggplot(df_temp, aes(x=periodo, y=monto_total)) +
  geom_area(fill=C1, alpha=0.2) +
  geom_line(color=C1, linewidth=0.8) +
  scale_x_date(date_labels="%y") +
  scale_y_continuous(labels=comma) +
  labs(title="Monto mensual", x=NULL, y="Monto") +
  mini_tema

# D: Top 5 clientes
pD <- tab_top_cust %>% slice_head(n=5) %>%
  ggplot(aes(x=reorder(anon_customer_id,monto_total), y=monto_total)) +
  geom_col(fill=C1, alpha=0.85) +
  geom_text(aes(label=monto_lbl), hjust=-0.1, size=3) +
  coord_flip() +
  scale_y_continuous(labels=comma, expand=expansion(mult=c(0,.3))) +
  labs(title="Top 5 Clientes", x=NULL, y="Monto") +
  mini_tema

# E: Nulos
pE <- tab_nulos %>% filter(N_Nulos>0) %>%
  ggplot(aes(x=reorder(Variable,N_Nulos), y=Pct_Nulos, fill=Severidad)) +
  geom_col(alpha=0.9) +
  coord_flip() +
  scale_fill_manual(values=col_severidad, guide="none") +
  scale_y_continuous(labels=function(x) paste0(x,"%")) +
  labs(title="% Nulos por Variable", x=NULL, y="%") +
  mini_tema

# F: Scatter mini
pF <- df_sample %>% slice_head(n = 1000) %>%
  ggplot(aes(x=arrears, y=amount, color=bucket_mora)) +
  geom_point(alpha=0.3, size=0.8) +
  scale_color_manual(values=PALETA_BUCKET, guide="none") +
  scale_x_continuous(labels=comma) +
  scale_y_continuous(labels=comma) +
  labs(title="Scatter Amount vs Arrears", x="Arrears", y="Amount") +
  mini_tema

panel_ejecutivo <- (pA | pB | pC) / (pD | pE | pF) +
  plot_annotation(
    title   = "Panel Ejecutivo — EDA Avanzado Base I2C",
    subtitle = paste0(formatC(nrow(df), big.mark=","),
                      " registros  |  14 variables  |  ",
                      n_distinct(df$anon_customer_id), " clientes"),
    caption = paste("Generado:", format(Sys.time(), "%d/%m/%Y %H:%M")),
    theme   = theme(
      plot.title    = element_text(face="bold", size=16, color=C1),
      plot.subtitle = element_text(size=11, color="gray40"),
      plot.caption  = element_text(size=8, color="gray55")
    )
  )
print(panel_ejecutivo)

# ═══════════════════════════════════════════════════════════════════════════════
#  FIN DEL SCRIPT
# ═══════════════════════════════════════════════════════════════════════════════

cat("\n", strrep("═", 65), "\n", sep="")

## 
## ═════════════════════════════════════════════════════════════════

cat("  ✅  ANÁLISIS AVANZADO COMPLETADO\n\n")

##   ✅  ANÁLISIS AVANZADO COMPLETADO

cat("  Gráficos generados:\n")

##   Gráficos generados:

cat("    SEC 1 — Bivariado  : 5 gráficos (boxplots, barras apiladas, scatter)\n")

##     SEC 1 — Bivariado  : 5 gráficos (boxplots, barras apiladas, scatter)

cat("    SEC 2 — Temporal   : 3 gráficos + panel unificado\n")

##     SEC 2 — Temporal   : 3 gráficos + panel unificado

cat("    SEC 3 — Correlación: heatmap + tabla de correlaciones fuertes\n")

##     SEC 3 — Correlación: heatmap + tabla de correlaciones fuertes

cat("    SEC 4 — Avanzadas  : density, density por bucket, violin, facetas\n")

##     SEC 4 — Avanzadas  : density, density por bucket, violin, facetas

cat("    SEC 5 — Calidad    : barras nulos, heatmap completitud, 100% apilado\n")

##     SEC 5 — Calidad    : barras nulos, heatmap completitud, 100% apilado

cat("    SEC 6 — Clientes   : top monto, boxplot mora, perfil bucket\n")

##     SEC 6 — Clientes   : top monto, boxplot mora, perfil bucket

cat("    PANEL EJECUTIVO    : resumen 6 gráficos clave en una vista\n")

##     PANEL EJECUTIVO    : resumen 6 gráficos clave en una vista

cat("\n  Para exportar cualquier gráfico:\n")

## 
##   Para exportar cualquier gráfico:

cat("    ggsave('nombre.png', plot = last_plot(), width=14, height=8, dpi=150)\n")

##     ggsave('nombre.png', plot = last_plot(), width=14, height=8, dpi=150)

cat(strrep("═", 65), "\n", sep="")

## ═════════════════════════════════════════════════════════════════

FINAL

2026-04-10