Librerías a utilizar

library(caret) # para preprocesado (scaling), nearzerovar y findcorrelation
## Loading required package: lattice
## Loading required package: ggplot2
library(DMwR) # para el balanceo SMOTE
## Warning: package 'DMwR' was built under R version 4.0.2
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(car) # para prueba de inflación de la varianza en la regresión
## Loading required package: carData
library(pROC) # para calcular el AUC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR) # para calcular el AUC
library(glmnet) # para regresión elastic net
## Warning: package 'glmnet' was built under R version 4.0.2
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(randomForest) # para random forest
## Warning: package 'randomForest' was built under R version 4.0.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(adabag) # para adaboost
## Loading required package: rpart
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel

Cargar y Explorar el dataset

# Set the working directory so the relative CSV path below resolves.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs unchanged where this directory exists - confirm before reuse.
setwd("C:/Users/licja/Downloads/Prueba BAM Javier Saravia") 
getwd()
## [1] "C:/Users/licja/Downloads/Prueba BAM Javier Saravia"
# Read the training set; the first row of the CSV holds the column names.
original <- read.csv("Base_entrenamiento.csv", header = TRUE)
dim(original) # loaded 20,000 rows and 124 columns
## [1] 20000   124
data <- original # feature engineering is performed on this copy

Omitir valores atípicos

Los valores atípicos de cada variable se identificaron visualmente mediante histogramas y luego se cuantificaron con un análisis de percentiles.

# Outlier removal: every entry below is an upper cap for one column. The caps
# are applied one after another, in the listed order, and a row survives only
# if its value is at or below the cap (rows where the comparison is NA are
# dropped, exactly as subset() does).
# pc_transaccional appears twice on purpose: the last entry re-trims the
# variable with a tighter cap after the other filters have been applied.
data <- local({
  upper_caps <- c(
    pc_transaccional       = 136903394,
    dmi_max_egreso_diario  = 85362328,
    dmi_max_ingreso_diario = 104909227.3,
    dh_val_otros_d         = 9998335.80,
    dmi_ingreso_total_mes  = 205163705.0,
    dh_val_pagos_d         = 50357360.00,
    pc_gasto_familiar      = 26582180.19,
    pc_cuotas_pagadas      = 8156850.0,
    dmi_egreso_total_mes   = 209829245.0,
    dh_val_salidas         = 217011100,
    cpc_sum_saldo          = 220905770.00,
    cpc_saldo_sobre_ing    = 20.307294513,
    cpc_saldo_tdc          = 43186883.96,
    dh_cant_tras_d         = 31,
    cpc_avg_saldo          = 106884393.50,
    dh_val_pago_tarj_d     = 10560315.0,
    pc_transaccional       = 92269310.0
  )
  cap_columns <- names(upper_caps)

  trimmed <- data
  # Index by position (not by name) because one column name is repeated.
  for (k in seq_along(upper_caps)) {
    within_cap <- trimmed[[cap_columns[k]]] <= upper_caps[[k]]
    trimmed <- trimmed[within_cap & !is.na(within_cap), , drop = FALSE]
  }
  trimmed
})

Exploración y Transformación de variables

Si bien este es un ejercicio, pueden hacerse las siguientes observaciones y sugerencias:

Sugiero variables como:

# Se crearán variables independientes ya con transformaciones
# Exceptuando las columnas que no se consideren aptas para el modelo
# Posteriormente, se creará un solo dataset con todas ellas

# Se crearán variables numéricas para usarse en los modelos lineales (reg logística y elastic net)

# Se determinará qué variables dejar evaluando que la varianza no sea cercana a 0
# Además, se eliminarán las variables independientes con alta correlación entre sí
# Muchas variables tienen frecuencia alta cerca de 0 (asimetría positiva), por lo que seguramente serán descartadas

names(data)[1] # max_trim: maximum days past due in the previous quarter
## [1] "max_trim"
# Distribution of the raw variable.
hist(data$max_trim)

# Same variable after a square-root transform. Note: sqrt only compresses the
# right tail; it does not standardize or normalize the variable.
hist(sqrt(data$max_trim))

# Keep the square-root version as a standalone vector (per the section notes,
# these vectors are combined into a single dataset later).
max_trim <- sqrt(data$max_trim)

names(data)[2] # max_sem: máximo días de mora en el semestre anterior
## [1] "max_sem"
hist(data$max_sem)

hist(sqrt(data$max_sem))

max_sem <- data$max_sem

names(data)[3] # desv_sem: desviación estándar del máximo de moras en el semestre anterior
## [1] "desv_sem"
hist(data$desv_sem)

hist(sqrt(data$desv_sem))

desv_sem <- data$desv_sem

names(data)[4] # prom_bim: promedio del máximo de moras en el bimestre anterior
## [1] "prom_bim"
hist(data$prom_bim)

hist(sqrt(data$prom_bim))

prom_bim <- sqrt(data$prom_bim)

names(data)[5] # max_mes_anterior: días de mora máximo en el mes anterior
## [1] "max_mes_anterior"
hist(data$max_mes_anterior)

hist(sqrt(data$max_mes_anterior))

max_mes_anterior <- data$max_mes_anterior

names(data)[6] # prom_mes_anterior: Promedio de los dias de mora en el mes anterior
## [1] "prom_mes_anterior"
hist(data$prom_mes_anterior)

hist(sqrt(data$prom_mes_anterior))

prom_mes_anterior <- sqrt(data$prom_mes_anterior)

names(data)[7] # prom_sem: promedio del máximo de moras en el semestre anterior
## [1] "prom_sem"
hist(data$prom_sem)

hist(sqrt(data$prom_sem))

prom_sem <- data$prom_sem

names(data)[8] # max_bim: máximo días de mora en el bimestre anterior
## [1] "max_bim"
hist(data$max_bim)

hist(sqrt(data$max_bim))

max_bim <- data$max_bim

names(data)[9] # mejor_gestion: Mejor gestion realizada
## [1] "mejor_gestion"
table(data$mejor_gestion)
## 
##     0     5     6     7    16 
## 18031     1     3     1    10
names(data)[10] # prom_trim: promedio del máximo de moras en el trimestre anterior
## [1] "prom_trim"
hist(data$prom_trim)

hist(sqrt(data$prom_trim))

prom_trim <- sqrt(data$prom_trim)

names(data)[11] # pc_cant_moras_30_ult_12_meses: Cantidad de moras 30 en los últimos 12 meses <= 5 o vacío.
## [1] "pc_cant_moras_30_ult_12_meses"
summary(data$pc_cant_moras_30_ult_12_meses)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.298   1.000  12.000
table(data$pc_cant_moras_30_ult_12_meses)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 11672  2024  1146   919   500   333   321   188   182   192   196   176   197
pc_cant_moras_30_ult_12_meses <- data$pc_cant_moras_30_ult_12_meses

names(data)[12] # desv_trim: desviación estándar del máximo de moras en el trimestre  anterior
## [1] "desv_trim"
hist(data$desv_trim)

hist(sqrt(data$desv_trim))

desv_trim <- sqrt(data$desv_trim)

names(data)[13] # nro_gestiones: Numero de gestiones realizadas
## [1] "nro_gestiones"
table(data$nro_gestiones)
## 
##     0     1     2     3     5 
## 18031     6     4     4     1
names(data)[14] # desv_bim: desviación estándar del máximo días de mora en el bimestre anterior
## [1] "desv_bim"
hist(data$desv_bim)

hist(sqrt(data$desv_bim))

desv_bim <- sqrt(data$desv_bim)

names(data)[15] # pc_cant_moras_30_ult_3_meses: Cantidad de moras 30 en los últimos 3 meses = 0 o vacío.
## [1] "pc_cant_moras_30_ult_3_meses"
summary(data$pc_cant_moras_30_ult_3_meses)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4745  0.0000  3.0000
hist(data$pc_cant_moras_30_ult_3_meses)

names(data)[16] # dh_cant_entradas: cantidad de trasacciones de ingreso de dinero tuvo en el mes anterior
## [1] "dh_cant_entradas"
summary(data$dh_cant_entradas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    5.00   11.52   17.00  139.00
hist(data$dh_cant_entradas)

hist(sqrt(data$dh_cant_entradas))

dh_cant_entradas <- sqrt(data$dh_cant_entradas)

names(data)[17] # pc_tiem_1er_prod_abierto_total: Tiempo total de producto abierto
## [1] "pc_tiem_1er_prod_abierto_total"
table(data$pc_tiem_1er_prod_abierto_total)
## 
##     0     2     3     5    11    14    16    17    18    19    21    22    24 
## 17794     6     2     5     3     1     5     1     2     1     4     1     1 
##    25    26    27    28    29    30    31    33    34    35    38    39    41 
##     2     5     2     5     4     2     5     2     4     1     2    35     2 
##    42    44    46    48    49    50    51    52    53    54    55    56    57 
##     1     3     3     2    21     4     2     2     1     5     4     2     2 
##    58    59    60    61    62    63    65    66    67    68    69    70    71 
##     9     3     1     1     3     2     2     4     2     8     3     3     2 
##    74    77    79    81    82    88    94    97   104   105   107   108   110 
##     1     1     2     1     4     1     2     3     2     1     1     3     1 
##   111   115   117   121   124   126   128   129   138   141   143   144   146 
##     1     1     1     1     2     3     2     1     6     3     1     1     2 
##   169   171   176   179   299 
##     1     1     2     1     1
hist(data$pc_tiem_1er_prod_abierto_total)

names(data)[18] # pc_cant_moras_60_ult_12_meses: Cantidad de moras 60 en los últimos 12 meses <= 1 o vacío.
## [1] "pc_cant_moras_60_ult_12_meses"
table(data$pc_cant_moras_60_ult_12_meses)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 14905  1052   528   353   230   130   147   175   128   132   105    87    74
hist(data$pc_cant_moras_60_ult_12_meses) # se optará por hacerla binaria

# Collapse the count of 60-day delinquencies in the last 12 months into a 0/1
# indicator: 0 when the count is below 1, 1 otherwise (NA stays NA).
# `x` is kept because it is the scratch variable tabulated right below.
flag_mora60_ult12meses <- x <-
  ifelse(data[["pc_cant_moras_60_ult_12_meses"]] < 1, 0, 1)
table(x)
## x
##     0     1 
## 14905  3141

names(data)[19] # gestiones_eficaces: Cantidad de gestiones eficaces
## [1] "gestiones_eficaces"
table(data$gestiones_eficaces)
## 
##     0     1     2     3 
## 18031    11     3     1
names(data)[20] # pc_transaccional: Ingreso de acuerdo al estimador transaccional del cliente
## [1] "pc_transaccional"
summary(data$pc_transaccional)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  2043200  7547230 14096941 19069978 92269310
boxplot(data$pc_transaccional) # valores atípicos en el lado superior de la variable

quantile(data$pc_transaccional, seq(0, 1, by = 0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##         7%         8%         9%        10%        11%        12%        13% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        14%        15%        16%        17%        18%        19%        20% 
##        0.0        0.0    57600.0   248650.0   410806.8   665000.0   910000.0 
##        21%        22%        23%        24%        25%        26%        27% 
##  1089945.0  1381618.2  1616750.0  1840077.7  2043200.0  2266212.4  2500000.0 
##        28%        29%        30%        31%        32%        33%        34% 
##  2667730.5  2880776.6  3000000.0  3192751.5  3356433.2  3512153.2  3723150.6 
##        35%        36%        37%        38%        39%        40%        41% 
##  3963014.5  4201883.2  4410000.0  4560285.8  4812979.0  5058932.8  5239323.0 
##        42%        43%        44%        45%        46%        47%        48% 
##  5555416.6  5822665.3  6149353.8  6386744.2  6600000.0  6810155.7  7000000.0 
##        49%        50%        51%        52%        53%        54%        55% 
##  7276333.0  7547229.5  7944952.3  8239311.7  8461116.5  8765504.8  9050000.0 
##        56%        57%        58%        59%        60%        61%        62% 
##  9400018.2  9727957.2 10067467.5 10400000.0 10780000.0 11350000.0 11863116.3 
##        63%        64%        65%        66%        67%        68%        69% 
## 12154410.1 12560835.0 12988541.8 13632351.8 14295950.0 14702743.6 15137238.4 
##        70%        71%        72%        73%        74%        75%        76% 
## 15629789.5 16275052.0 16971838.3 17545912.3 18452323.0 19069977.5 19750000.0 
##        77%        78%        79%        80%        81%        82%        83% 
## 20328585.3 21272355.0 22094848.0 23043772.4 24400000.0 25657795.3 27167290.0 
##        84%        85%        86%        87%        88%        89%        90% 
## 28450000.0 30005655.5 31122150.0 32515000.0 34360117.8 36540016.0 38620000.0 
##        91%        92%        93%        94%        95%        96%        97% 
## 41672027.0 43349004.2 45725673.9 49521221.0 52488819.3 57918192.0 63254555.0 
##        98%        99%       100% 
## 70506644.0 79003883.0 92269310.0
x <- subset(data$pc_transaccional, data$pc_transaccional <= 92269310.0)
hist(x)

pc_transaccional <- data$pc_transaccional

names(data)[21] # dh_max_dia_entradas: Ultimo dia en que recibio alguna transaccion de ingreso de dinero
## [1] "dh_max_dia_entradas"
table(data$dh_max_dia_entradas)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 6166   10   31   11   17   27   29   27   46   51   53   46   83   65   97   71 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##   92   79   69   74  105  155  168  201  214  183  196  331 1360  449 2911 4629
hist(data$dh_max_dia_entradas) # se volverá binaria

# Flag customers whose last incoming transaction fell in the first half of the
# month (days 1-15).
# Day 0 is excluded on purpose: it is not a calendar day and, in the table
# above, it accounts for 6166 rows. With the previous condition (<= 15) those
# rows were flagged as 1, so the flag mostly marked the day-0 sentinel
# (presumably "no incoming transaction" - confirm with the data owner) rather
# than a first-fortnight inflow.
# NA days still yield NA, as before.
last_inflow_day_in_first_half <-
  data$dh_max_dia_entradas >= 1 & data$dh_max_dia_entradas <= 15
x <- ifelse(last_inflow_day_in_first_half, 1, 0)
rm(last_inflow_day_in_first_half) # do not leave helper objects behind
table(x)
## x
##     0     1 
## 17382   664
# Counts above are recomputed from the day-by-day table: days 1-15 sum to 664,
# and the remaining 17382 rows (day 0 plus days 16-31) get 0.
flag_ultima_entrada_1quincena <- x

names(data)[22] # pc_cupo_entidad: Cupo de las tarjetas de crédito en el banco
## [1] "pc_cupo_entidad"
summary(data$pc_cupo_entidad)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0   137908        0 36000000
hist(data$pc_cupo_entidad)

table(ifelse(data$pc_cupo_entidad >= 1, 1, 0))
## 
##     0     1 
## 17834   212
names(data)[23] # pc_cuotas_como_ppal: Cuotas pagadas como principal
## [1] "pc_cuotas_como_ppal"
summary(data$pc_cuotas_como_ppal)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   49160       0 8156850
hist(data$pc_cuotas_como_ppal)

names(data)[24] # dh_val_entradas: Valor total de los ingresos tomados en el mes anterior
## [1] "dh_val_entradas"
summary(data$dh_val_entradas)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   5088944  18769832  26188088 204764342
hist(data$dh_val_entradas)

hist(sqrt(data$dh_val_entradas))

dh_val_entradas <- sqrt(data$dh_val_entradas)

names(data)[25] # pc_cant_moras_90_ult_12_meses: Cantidad de moras 90 o superiores en los últimos 12 meses = 0 o vacío.
## [1] "pc_cant_moras_90_ult_12_meses"
table(data$pc_cant_moras_90_ult_12_meses)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 16250   454   310   163   149   151   106   108   111    92    69    33    50
hist(data$pc_cant_moras_90_ult_12_meses)

hist(sqrt(data$pc_cant_moras_90_ult_12_meses))

names(data)[26] # dh_max_dia_salidas
## [1] "dh_max_dia_salidas"
# prefiero quedarme con la misma variable pero para el mes actual

names(data)[27] # pc_cant_moras_60_ult_3_meses: Cantidad de moras 60 en los últimos 3 meses = 0 o vacío.
## [1] "pc_cant_moras_60_ult_3_meses"
table(data$pc_cant_moras_60_ult_3_meses)
## 
##     0     1     2     3 
## 15713   833   443  1057
table(ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0))
## 
##     0     1 
## 15713  2333
flag_tuvo_mora60_ult3meses <- ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0)

names(data)[28] # pc_cuota_tarjeta_de_credito: Cuota de tarjeta de crédito  reportada por CIFIN
## [1] "pc_cuota_tarjeta_de_credito"
summary(data$pc_cuota_tarjeta_de_credito)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   14910       0 5445000
hist(data$pc_cuota_tarjeta_de_credito)

table(ifelse(data$pc_cuota_tarjeta_de_credito >= 1, 1, 0))
## 
##     0     1 
## 17803   243
names(data)[29] # cp_inicial_menos_saldo: valor inicial menos el saldo en el mes anterior por producto
## [1] "cp_inicial_menos_saldo"
summary(data$cp_inicial_menos_saldo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[30] # pc_peor_estado_act_cta_aho: Peor estado cuenta ahorro
## [1] "pc_peor_estado_act_cta_aho"
summary(data$pc_peor_estado_act_cta_aho)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04034 0.00000 3.00000
hist(data$pc_peor_estado_act_cta_aho)

names(data)[31] # dia_pago: Dia de pago de la obligacion en el mes
## [1] "dia_pago"
table(data$dia_pago)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##  599  314 4072  227  238  226  258  203  210  208  278  233  277  215 5161  245 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
## 1856  221  270  200  220  283  235  206  232  250  267  274  188  333   47
x <- ifelse(data$dia_pago <= 15, 1, 0)
table(x)
## x
##     0     1 
##  5327 12719
flag_diaPago_1quincena <- x

names(data)[32] # cp_cuotas_falta: Cantidad de cuotas faltantes
## [1] "cp_cuotas_falta"
table(data$cp_cuotas_falta)
## 
##     0 
## 18046
names(data)[33] # pcons_tarjeta_de_credito: Es el producto una tarjeta de credito
## [1] "pcons_tarjeta_de_credito"
table(data$pcons_tarjeta_de_credito)
## 
##     0     1 
## 12404  5642
pcons_tarjeta_de_credito <- data$pcons_tarjeta_de_credito

names(data)[34] # pc_cifin: Ingreso del cliente de acuerdo a CIFIN
## [1] "pc_cifin"
summary(data$pc_cifin)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    77941        0 23509011
hist(data$pc_cifin)

names(data)[35] # gestiones_prod: Numero de gestiones productivas
## [1] "gestiones_prod"
table(data$gestiones_prod)
## 
##     0     1     2     3 
## 18036     8     1     1
names(data)[36] # pcons_vehiculos_sufi: Es el producto un vehiculo sufi
## [1] "pcons_vehiculos_sufi"
table(data$pcons_vehiculos_sufi)
## 
##     0     1 
## 16990  1056
names(data)[37] # cluster_recod: segmento cluster
## [1] "cluster_recod"
table(data$cluster_recod)
## 
##    1    6    7    9   13   16   19 
##   99 9105 5284   49 1867 1008  634
# 0/1 indicator for membership in cluster 6, the largest segment in the table
# above (9105 rows): 0 for any other cluster, 1 for cluster 6, NA stays NA.
# `x` is kept because it is the scratch variable tabulated right below.
flag_es_cluster_6 <- x <- ifelse(data[["cluster_recod"]] != 6, 0, 1)
table(x)
## x
##    0    1 
## 8941 9105

names(data)[38] # dh_avg_dia_retiros_d: Dia promedio del mes en el que realiza los retiros 
## [1] "dh_avg_dia_retiros_d"
hist(data$dh_avg_dia_retiros_d)

dh_avg_dia_retiros_d <- data$dh_avg_dia_retiros_d

names(data)[39] # dmi_max_egreso_diario: Maximo egreso en un dia del mes anterior 
## [1] "dmi_max_egreso_diario"
summary(data$dmi_max_egreso_diario)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0  2396128  6685783  9264177 85362328
hist(data$dmi_max_egreso_diario)

quantile(data$dmi_max_egreso_diario, seq(0,1,by = 0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##         7%         8%         9%        10%        11%        12%        13% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        14%        15%        16%        17%        18%        19%        20% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        21%        22%        23%        24%        25%        26%        27% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        28%        29%        30%        31%        32%        33%        34% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        35%        36%        37%        38%        39%        40%        41% 
##        0.0    15805.6   108247.0   355864.7   579439.1   791444.0  1003106.8 
##        42%        43%        44%        45%        46%        47%        48% 
##  1152466.0  1308571.8  1472964.0  1600000.0  1782669.0  1888031.4  2008000.0 
##        49%        50%        51%        52%        53%        54%        55% 
##  2170653.4  2396128.5  2579869.1  2765324.0  2936515.0  3076251.0  3276314.0 
##        56%        57%        58%        59%        60%        61%        62% 
##  3537956.0  3805458.8  4000000.0  4148169.0  4461603.0  4688264.5  4987219.0 
##        63%        64%        65%        66%        67%        68%        69% 
##  5041687.1  5342686.0  5668527.5  6001403.5  6279611.5  6622351.2  7028000.0 
##        70%        71%        72%        73%        74%        75%        76% 
##  7400503.0  7733143.0  8032000.0  8416410.3  8911504.0  9264177.0  9856840.0 
##        77%        78%        79%        80%        81%        82%        83% 
## 10040000.0 10276101.0 10696562.4 11244800.0 11856206.8 12224783.0 12825890.5 
##        84%        85%        86%        87%        88%        89%        90% 
## 13536058.6 14305838.5 15037327.9 15742756.1 16566000.0 17589063.0 18992449.0 
##        91%        92%        93%        94%        95%        96%        97% 
## 20080000.0 21146267.0 22772981.0 25016338.8 27306876.0 30278258.2 35581436.4 
##        98%        99%       100% 
## 40866473.0 52347170.7 85362328.0
x <- subset(data$dmi_max_egreso_diario, data$dmi_max_egreso_diario <= 85362328)
hist(x)

dmi_max_egreso_diario <- data$dmi_max_egreso_diario

names(data)[40] # cpc_max_proc_deuda: Máximo del porcentaje de la deuda en el mes anterior
## [1] "cpc_max_proc_deuda"
summary(data$cpc_max_proc_deuda)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -28.1820   0.0000   0.2023   0.4324   0.9399  13.8796
names(data)[41] # dh_cant_otros_d: Cantidad de trasacciones de salida por concepto de otros
## [1] "dh_cant_otros_d"
summary(data$dh_cant_otros_d)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   3.092   4.000  74.000
hist(data$dh_cant_otros_d)

hist(sqrt(data$dh_cant_otros_d))

dh_cant_otros_d <- sqrt(data$dh_cant_otros_d)

names(data)[42] # pc_cont_30_lt_12m_tot_sf: Cantidad mora 30 últimos 12 meses sector financiero
## [1] "pc_cont_30_lt_12m_tot_sf"
summary(data$pc_cont_30_lt_12m_tot_sf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0681  0.0000 64.0000
hist(data$pc_cont_30_lt_12m_tot_sf)

table(data$pc_cont_30_lt_12m_tot_sf)
## 
##     0     1     2     3     5     6    64 
## 17999    13     5     3     3     5    18
names(data)[43] # pc_cant_mora90_ult_12m_total
## [1] "pc_cant_mora90_ult_12m_total"
table(data$pc_cant_mora90_ult_12m_total)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
## 16882   212   145   123    84   121    79   102    97    82    57    27    35
table(ifelse(data$pc_cant_mora90_ult_12m_total >= 1, 1, 0))
## 
##     0     1 
## 16882  1164
x <- ifelse(data$pc_cant_mora90_ult_12m_total >= 1, 1, 0)
flag_tuvo_mora90_ult12M <- x

names(data)[44] # dmi_max_ingreso_diario: Maximo ingreso en un dia del mes anterior 
## [1] "dmi_max_ingreso_diario"
summary(data$dmi_max_ingreso_diario)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   2540003   7845783  10000027 103018788
hist(data$dmi_max_ingreso_diario)

quantile(data$dmi_max_ingreso_diario, seq(0, 1, by = 0.01))
##          0%          1%          2%          3%          4%          5% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##          6%          7%          8%          9%         10%         11% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         12%         13%         14%         15%         16%         17% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         18%         19%         20%         21%         22%         23% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         24%         25%         26%         27%         28%         29% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         30%         31%         32%         33%         34%         35% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         36%         37%         38%         39%         40%         41% 
##         7.2     22203.0    195000.0    490000.0    700000.0    960900.0 
##         42%         43%         44%         45%         46%         47% 
##   1092137.9   1300000.0   1500000.0   1656152.0   1900000.0   2000000.0 
##         48%         49%         50%         51%         52%         53% 
##   2100000.0   2328568.1   2540003.0   2750003.0   3000000.0   3050000.0 
##         54%         55%         56%         57%         58%         59% 
##   3341821.6   3600000.0   3950000.0   4032169.0   4300000.0   4594069.3 
##         60%         61%         62%         63%         64%         65% 
##   4950000.0   5000000.0   5193296.1   5500000.0   6000000.0   6190591.0 
##         66%         67%         68%         69%         70%         71% 
##   6525936.2   7000000.0   7262786.6   7724640.1   8000001.0   8560257.4 
##         72%         73%         74%         75%         76%         77% 
##   9000000.0   9604260.0  10000000.0  10000026.8  10340000.0  10842262.6 
##         78%         79%         80%         81%         82%         83% 
##  11175227.0  12000000.0  12437389.0  13000000.0  13758802.2  14462555.0 
##         84%         85%         86%         87%         88%         89% 
##  15200041.0  16124906.2  17541904.3  18687254.4  19862000.0  20148078.3 
##         90%         91%         92%         93%         94%         95% 
##  21350000.0  23972400.0  25872101.6  28563056.0  30227878.6  33514149.0 
##         96%         97%         98%         99%        100% 
##  38000000.0  43250075.4  50045136.0  67883640.4 103018788.0
x <- subset(data$dmi_max_ingreso_diario, data$dmi_max_ingreso_diario <= 104909227.3)
hist(x)

dmi_max_ingreso_diario <- data$dmi_max_ingreso_diario

names(data)[45] # dh_val_otros_d: Valor total de las trasacciones de salida por concepto de otros en un mes
## [1] "dh_val_otros_d"
summary(data$dh_val_otros_d)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0    7586  513468  396635 9932459
hist(data$dh_val_otros_d)

quantile(data$dh_val_otros_d, seq(0,1,by=0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##         7%         8%         9%        10%        11%        12%        13% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        14%        15%        16%        17%        18%        19%        20% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        21%        22%        23%        24%        25%        26%        27% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        28%        29%        30%        31%        32%        33%        34% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        35%        36%        37%        38%        39%        40%        41% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        42%        43%        44%        45%        46%        47%        48% 
##       0.00       0.00       0.00       0.00       0.00     900.00    1919.00 
##        49%        50%        51%        52%        53%        54%        55% 
##    3222.00    7586.00   10100.00   12483.00   13271.00   13271.00   14360.50 
##        56%        57%        58%        59%        60%        61%        62% 
##   17452.60   20760.00   23736.00   29835.00   36771.00   49855.45   62593.60 
##        63%        64%        65%        66%        67%        68%        69% 
##   75757.00   90129.00  104071.00  125011.80  146179.00  175034.00  202801.25 
##        70%        71%        72%        73%        74%        75%        76% 
##  229524.50  253039.00  280656.40  311751.40  348722.00  396634.75  436151.80 
##        77%        78%        79%        80%        81%        82%        83% 
##  482810.00  528182.00  573716.00  626437.00  689522.20  759090.00  837712.00 
##        84%        85%        86%        87%        88%        89%        90% 
##  934669.60 1013325.00 1107249.00 1229253.00 1340862.20 1509363.45 1692885.00 
##        91%        92%        93%        94%        95%        96%        97% 
## 1844536.00 2006918.00 2257572.45 2517164.00 2796438.00 3337151.80 4026117.00 
##        98%        99%       100% 
## 4825491.20 6503208.90 9932459.00
x <- subset(data$dh_val_otros_d, data$dh_val_otros_d <= 9998335.80)
hist(x)

dh_val_otros_d <- data$dh_val_otros_d

names(data)[46] # pc_ingreso_final: Ingreso de final del cliente
## [1] "pc_ingreso_final"
summary(data$pc_ingreso_final)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  2000000  7221917 13140723 17741170 92226100
hist(data$pc_ingreso_final)

pc_ingreso_final <- data$pc_ingreso_final

names(data)[47] # dh_cant_pagos_d: Cantidad de pagos de salidas tuvo en el mes anterior
## [1] "dh_cant_pagos_d"
summary(data$dh_cant_pagos_d)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   3.000   4.864   8.000  49.000
hist(data$dh_cant_pagos_d)

dh_cant_pagos_d <- data$dh_cant_pagos_d

names(data)[48] # dmi_ingreso_total_mes: Ingreso total del mes anterior
## [1] "dmi_ingreso_total_mes"
summary(data$dmi_ingreso_total_mes)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   5088944  18769832  26188088 204764342
hist(data$dmi_ingreso_total_mes)

quantile(data$dmi_ingreso_total_mes, seq(0, 1, by = 0.01))
##           0%           1%           2%           3%           4%           5% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##           6%           7%           8%           9%          10%          11% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          12%          13%          14%          15%          16%          17% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          18%          19%          20%          21%          22%          23% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          24%          25%          26%          27%          28%          29% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          30%          31%          32%          33%          34%          35% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          36%          37%          38%          39%          40%          41% 
##         8.00     24049.25    250005.50    630000.00   1056000.00   1400006.45 
##          42%          43%          44%          45%          46%          47% 
##   1838897.00   2199726.35   2600000.00   3000000.00   3300002.00   3790873.90 
##          48%          49%          50%          51%          52%          53% 
##   4106175.00   4564100.00   5088944.00   5550008.00   6004109.00   6639751.45 
##          54%          55%          56%          57%          58%          59% 
##   7250008.40   8010000.00   8667401.40   9450000.00  10150000.00  10675000.00 
##          60%          61%          62%          63%          64%          65% 
##  11550000.00  12200000.00  13019188.80  13904977.15  14787240.80  15525970.00 
##          66%          67%          68%          69%          70%          71% 
##  16474081.20  17555134.00  18585215.00  19600704.25  20290000.00  21541384.00 
##          72%          73%          74%          75%          76%          77% 
##  22528017.00  23630000.00  25156589.10  26188088.00  27948415.80  29405868.00 
##          78%          79%          80%          81%          82%          83% 
##  30600887.80  32212302.85  34000005.00  35944963.75  37561559.20  39459092.60 
##          84%          85%          86%          87%          88%          89% 
##  41691250.00  44302738.75  46987343.00  49143482.00  51159723.00  54010666.00 
##          90%          91%          92%          93%          94%          95% 
##  57475971.50  61466784.45  65288828.60  68862425.00  74272415.00  81843676.25 
##          96%          97%          98%          99%         100% 
##  88714237.00  98130984.65 111743381.40 132321500.00 204764342.00
# Workspace copy of the full (unfiltered) column.
dmi_ingreso_total_mes <- data[["dmi_ingreso_total_mes"]]

# Histogram of the values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff is above the 100% percentile printed above
# (204764342), so on this data the filter appears to remove nothing -- confirm.
x <- dmi_ingreso_total_mes[which(dmi_ingreso_total_mes <= 205163705.0)]
hist(x)

# Column 49: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
names(data)[49] # dh_val_pagos_d: total value of outgoing transactions for "other" payments
## [1] "dh_val_pagos_d"
summary(data$dh_val_pagos_d)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0   551214  3019832  3753468 50357360
hist(data$dh_val_pagos_d)

quantile(data$dh_val_pagos_d, seq(0, 1, by = 0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##         7%         8%         9%        10%        11%        12%        13% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        14%        15%        16%        17%        18%        19%        20% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        21%        22%        23%        24%        25%        26%        27% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        28%        29%        30%        31%        32%        33%        34% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        35%        36%        37%        38%        39%        40%        41% 
##        0.0        0.0        0.0        0.0     1953.0    20000.0    40345.0 
##        42%        43%        44%        45%        46%        47%        48% 
##    63499.9    95093.0   127370.0   167836.2   227337.3   298628.5   373379.0 
##        49%        50%        51%        52%        53%        54%        55% 
##   458507.0   551214.0   645925.8   749470.0   854844.9   947913.0  1051616.0 
##        56%        57%        58%        59%        60%        61%        62% 
##  1162545.0  1283067.2  1388163.9  1500013.5  1610615.0  1710256.8  1847643.0 
##        63%        64%        65%        66%        67%        68%        69% 
##  1956650.8  2070132.4  2221932.8  2344112.0  2504857.0  2665210.0  2818400.0 
##        70%        71%        72%        73%        74%        75%        76% 
##  2961302.0  3074574.8  3221620.4  3432337.0  3589801.0  3753468.5  3964881.6 
##        77%        78%        79%        80%        81%        82%        83% 
##  4183091.0  4404183.0  4666203.0  4931637.0  5219968.5  5461909.0  5688066.2 
##        84%        85%        86%        87%        88%        89%        90% 
##  6002214.6  6270027.5  6668134.8  7100857.4  7530003.8  8157880.3  8782633.0 
##        91%        92%        93%        94%        95%        96%        97% 
##  9337860.7 10308822.0 11291463.0 12556160.2 13782306.0 15175063.0 17581258.2 
##        98%        99%       100% 
## 20688616.2 27004379.6 50357360.0
# Workspace copy of the full (unfiltered) column.
dh_val_pagos_d <- data[["dh_val_pagos_d"]]

# Histogram of the values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff equals the 100% percentile printed above
# (50357360.0), so the filter appears to keep every value -- confirm.
x <- dh_val_pagos_d[which(dh_val_pagos_d <= 50357360.00)]
hist(x)

# Column 50: print its name, summary statistics and the 0-100% percentiles in 1% steps.
names(data)[50] # pc_gasto_familiar: customer's household expense amount
## [1] "pc_gasto_familiar"
summary(data$pc_gasto_familiar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0   723816  2070132  3368379  4335488 23067328
quantile(data$pc_gasto_familiar, seq(0,1,by=0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##         7%         8%         9%        10%        11%        12%        13% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        14%        15%        16%        17%        18%        19%        20% 
##        0.0        0.0    52827.5   144582.5   246500.0   350000.0   430100.0 
##        21%        22%        23%        24%        25%        26%        27% 
##   521914.7   588486.8   637956.8   687452.3   723815.7   763864.1   782813.0 
##        28%        29%        30%        31%        32%        33%        34% 
##   803882.1   836711.5   861377.8   882000.0   931155.0   975490.5  1039732.4 
##        35%        36%        37%        38%        39%        40%        41% 
##  1088643.0  1146363.2  1202858.0  1262250.0  1306491.8  1363816.0  1441944.3 
##        42%        43%        44%        45%        46%        47%        48% 
##  1519899.3  1583532.9  1653079.6  1712143.6  1775832.6  1845026.6  1910025.3 
##        49%        50%        51%        52%        53%        54%        55% 
##  2009880.8  2070131.5  2113333.7  2180887.5  2241724.8  2292477.7  2340728.6 
##        56%        57%        58%        59%        60%        61%        62% 
##  2403626.6  2454449.8  2507395.5  2550000.0  2582812.1  2629610.5  2698945.9 
##        63%        64%        65%        66%        67%        68%        69% 
##  2785419.6  2861250.0  2982375.6  3111782.2  3124333.0  3230361.3  3359112.7 
##        70%        71%        72%        73%        74%        75%        76% 
##  3545186.2  3722838.1  3892993.9  4043832.1  4196875.0  4335487.7  4530732.8 
##        77%        78%        79%        80%        81%        82%        83% 
##  4670284.2  4871802.6  5111082.5  5362225.0  5630442.3  5912383.7  6253169.6 
##        84%        85%        86%        87%        88%        89%        90% 
##  6542500.0  6842430.6  7286724.5  7673500.0  8172391.9  8594050.8  9055328.7 
##        91%        92%        93%        94%        95%        96%        97% 
##  9534963.8 10206885.0 10737379.9 11311996.9 12263970.4 13361803.4 14807103.4 
##        98%        99%       100% 
## 16718602.0 18943025.0 23067327.5
# Workspace copy of the full (unfiltered) column.
pc_gasto_familiar <- data[["pc_gasto_familiar"]]

# Histogram of the values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff is above the 100% percentile printed above
# (23067327.5), so on this data the filter appears to remove nothing -- confirm.
x <- pc_gasto_familiar[which(pc_gasto_familiar <= 26582180.19)]
hist(x)

# Column 51: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
names(data)[51] # pc_cuotas_pagadas: value of the installments paid by the customer
## [1] "pc_cuotas_pagadas"
summary(data$pc_cuotas_pagadas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0  290000  704829  900899 8156850
hist(data$pc_cuotas_pagadas)

quantile(data$pc_cuotas_pagadas, seq(0, 1, by = 0.01))
##        0%        1%        2%        3%        4%        5%        6%        7% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##        8%        9%       10%       11%       12%       13%       14%       15% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##       16%       17%       18%       19%       20%       21%       22%       23% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##       24%       25%       26%       27%       28%       29%       30%       31% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##       32%       33%       34%       35%       36%       37%       38%       39% 
##       0.0       0.0   43500.0   74571.4   95700.0  116000.0  118769.8  145000.0 
##       40%       41%       42%       43%       44%       45%       46%       47% 
##  150348.9  174000.0  184800.0  203700.0  216000.0  232000.0  240000.0  258000.0 
##       48%       49%       50%       51%       52%       53%       54%       55% 
##  284200.0  290000.0  290000.0  297478.3  312788.1  334276.0  346446.0  370968.9 
##       56%       57%       58%       59%       60%       61%       62%       63% 
##  392000.0  400305.1  410880.6  438368.6  460600.0  486710.1  490000.0  507349.8 
##       64%       65%       66%       67%       68%       69%       70%       71% 
##  528942.6  559000.0  602000.0  635938.0  652435.4  680000.0  727600.0  748420.0 
##       72%       73%       74%       75%       76%       77%       78%       79% 
##  789688.2  836217.2  860000.0  900898.7  935279.0  981092.5 1010402.6 1045573.2 
##       80%       81%       82%       83%       84%       85%       86%       87% 
## 1100000.0 1169898.4 1220208.2 1306574.5 1380618.1 1459464.8 1560829.4 1625384.1 
##       88%       89%       90%       91%       92%       93%       94%       95% 
## 1700700.2 1769856.3 1841142.1 2033524.0 2200101.2 2360000.0 2570558.7 2894792.0 
##       96%       97%       98%       99%      100% 
## 3178855.5 3683356.2 4743779.9 6010000.0 8156850.0
# Workspace copy of the full (unfiltered) column.
pc_cuotas_pagadas <- data[["pc_cuotas_pagadas"]]

# Histogram of the values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff equals the 100% percentile printed above
# (8156850.0), so the filter appears to keep every value -- confirm.
x <- pc_cuotas_pagadas[which(pc_cuotas_pagadas <= 8156850.0)]
hist(x)

# Column 52: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
# NOTE(review): the summary shows a minimum of -119.1284 and a maximum of 13.8796 while the
# 1%-99% percentiles stay within [0, 1.23]; the extremes look like outliers -- confirm.
names(data)[52] # cpc_avg_proc_deuda: average of the customer's debt percentages in the previous month (percentage = ratio of balance to initial value)
## [1] "cpc_avg_proc_deuda"
summary(data$cpc_avg_proc_deuda)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -119.1284    0.0000    0.1820    0.3917    0.8733   13.8796
hist(data$cpc_avg_proc_deuda)

quantile(data$cpc_avg_proc_deuda, seq(0,1,by=0.01))
##            0%            1%            2%            3%            4% 
## -119.12840000    0.00000000    0.00000000    0.00000000    0.00000000 
##            5%            6%            7%            8%            9% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           10%           11%           12%           13%           14% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           15%           16%           17%           18%           19% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           20%           21%           22%           23%           24% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           25%           26%           27%           28%           29% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           30%           31%           32%           33%           34% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           35%           36%           37%           38%           39% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           40%           41%           42%           43%           44% 
##    0.00000000    0.00000000    0.00000000    0.00000000    0.00000000 
##           45%           46%           47%           48%           49% 
##    0.00317328    0.01566387    0.05337072    0.09155538    0.14000205 
##           50%           51%           52%           53%           54% 
##    0.18205318    0.22012374    0.25548425    0.28493140    0.32651283 
##           55%           56%           57%           58%           59% 
##    0.37242599    0.40897600    0.43696430    0.46846924    0.50734022 
##           60%           61%           62%           63%           64% 
##    0.53930356    0.56982057    0.59705567    0.63020920    0.65465374 
##           65%           66%           67%           68%           69% 
##    0.68039150    0.70248708    0.72589106    0.74845785    0.77251605 
##           70%           71%           72%           73%           74% 
##    0.79321046    0.81041248    0.82971103    0.84219350    0.85846768 
##           75%           76%           77%           78%           79% 
##    0.87328461    0.88886939    0.90253529    0.91428714    0.92869600 
##           80%           81%           82%           83%           84% 
##    0.94124096    0.95355144    0.96537928    0.97453072    0.98651367 
##           85%           86%           87%           88%           89% 
##    1.00066241    1.01072154    1.02341465    1.04034223    1.05742733 
##           90%           91%           92%           93%           94% 
##    1.07356220    1.08732512    1.09956045    1.11127967    1.12717757 
##           95%           96%           97%           98%           99% 
##    1.14653787    1.16522691    1.18974892    1.20539930    1.22587058 
##          100% 
##   13.87960000
# Column 53: print its name, summary statistics and histogram.
# NOTE(review): the summary below is identical to the one printed for cpc_avg_proc_deuda;
# the two columns may be duplicates -- verify before using both as features.
names(data)[53] # cpc_sum_proc_deuda: sum of the customer's debt percentages in the previous month (percentage = ratio of balance to initial value)
## [1] "cpc_sum_proc_deuda"
summary(data$cpc_sum_proc_deuda)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -119.1284    0.0000    0.1820    0.3917    0.8733   13.8796
hist(data$cpc_sum_proc_deuda)

# Column 54: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
names(data)[54] # dc_porc_prod_sin_mora: percentage of products not in arrears across the whole system
## [1] "dc_porc_prod_sin_mora"
summary(data$dc_porc_prod_sin_mora)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.01288 0.00000 1.00000
hist(data$dc_porc_prod_sin_mora)

quantile(data$dc_porc_prod_sin_mora, seq(0,1,by=0.01))
##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  96%  97%  98%  99% 100% 
##    0    0    0    1    1
# Column 55 -- pc_ingreso_rutina_con_techo: routine income after applying the per-segment caps.
colnames(data)[55]
## [1] "pc_ingreso_rutina_con_techo"
summary(data[["pc_ingreso_rutina_con_techo"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  2122000  7576047 14120244 19076750 92269310
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$pc_ingreso_rutina_con_techo)

# Workspace copy of the column.
pc_ingreso_rutina_con_techo <- data[["pc_ingreso_rutina_con_techo"]]

# Column 56: print its name, summary statistics and histogram.
names(data)[56] # pc_saldo_prom3_tdc_entidad: average credit-card balance at the bank over the last 3 months
## [1] "pc_saldo_prom3_tdc_entidad"
summary(data$pc_saldo_prom3_tdc_entidad)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    92238        0 30110000
hist(data$pc_saldo_prom3_tdc_entidad)

# Column 57 -- dh_cant_salidas: number of outgoing money transactions in a month.
colnames(data)[57]
## [1] "dh_cant_salidas"
summary(data[["dh_cant_salidas"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   15.00   27.59   42.00  404.00
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$dh_cant_salidas)

# Workspace copy of the column.
dh_cant_salidas <- data[["dh_cant_salidas"]]

# Column 58 -- dh_min_dia_pagos_d: first day on which a credit was made in the previous month.
colnames(data)[58]
## [1] "dh_min_dia_pagos_d"
summary(data[["dh_min_dia_pagos_d"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   3.952   5.000  31.000
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$dh_min_dia_pagos_d)

# Workspace copy of the column.
dh_min_dia_pagos_d <- data[["dh_min_dia_pagos_d"]]

# Column 59 -- pc_ingreso_por_rutina: routine income.
# NOTE(review): the summary below is identical to the one printed for
# pc_ingreso_rutina_con_techo; the two columns may be duplicates -- verify.
colnames(data)[59]
## [1] "pc_ingreso_por_rutina"
summary(data[["pc_ingreso_por_rutina"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  2122000  7576047 14120244 19076750 92269310
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$pc_ingreso_por_rutina)

# Workspace copy of the column.
pc_ingreso_por_rutina <- data[["pc_ingreso_por_rutina"]]

# Column 60: print its name and histogram.
names(data)[60] # dh_min_dia_pago_tarj_d: first day on which the credit card was paid in the previous month
## [1] "dh_min_dia_pago_tarj_d"
hist(data$dh_min_dia_pago_tarj_d)

# Column 61: the summary shows every statistic equal to 0, i.e. the column is constant in this data.
names(data)[61] # cp_nro_cuota: number of agreed installments per product
## [1] "cp_nro_cuota"
summary(data$cp_nro_cuota)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Column 62: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
names(data)[62] # dmi_egreso_total_mes: total outflow of the previous month
## [1] "dmi_egreso_total_mes"
summary(data$dmi_egreso_total_mes)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   5735566  19552715  28140390 204098909
hist(data$dmi_egreso_total_mes)

quantile(data$dmi_egreso_total_mes, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##          6%          7%          8%          9%         10%         11% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         12%         13%         14%         15%         16%         17% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         18%         19%         20%         21%         22%         23% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         24%         25%         26%         27%         28%         29% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         30%         31%         32%         33%         34%         35% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         36%         37%         38%         39%         40%         41% 
##     20098.2    150464.5    469809.0   1007389.1   1364407.0   1757330.9 
##         42%         43%         44%         45%         46%         47% 
##   2139132.8   2597761.0   3055581.0   3491431.8   3875658.7   4341015.8 
##         48%         49%         50%         51%         52%         53% 
##   4728541.2   5246081.5   5735566.0   6222528.9   6750806.4   7463828.4 
##         54%         55%         56%         57%         58%         59% 
##   8156806.7   8903706.2   9676725.2  10353244.3  10949690.0  11653537.0 
##         60%         61%         62%         63%         64%         65% 
##  12507194.0  13308026.2  14158401.7  14949690.7  15668673.0  16852909.0 
##         66%         67%         68%         69%         70%         71% 
##  17823161.3  18701866.2  19649493.0  20633345.1  21568478.0  22936048.2 
##         72%         73%         74%         75%         76%         77% 
##  24164720.0  25304132.1  26608243.8  28140390.0  29543323.8  30673503.0 
##         78%         79%         80%         81%         82%         83% 
##  32490727.9  34304664.0  36063171.0  37668819.0  39804887.1  41664563.1 
##         84%         85%         86%         87%         88%         89% 
##  43936740.2  46612353.5  48890887.4  50950735.5  52832299.8  55616061.5 
##         90%         91%         92%         93%         94%         95% 
##  58941776.5  62642842.1  66264618.4  70686064.8  76333370.2  82037209.0 
##         96%         97%         98%         99%        100% 
##  90014943.2  99504343.8 113054665.2 132338269.6 204098909.0
# Workspace copy of the full (unfiltered) column.
dmi_egreso_total_mes <- data[["dmi_egreso_total_mes"]]

# Histogram of the values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff is above the 100% percentile printed above
# (204098909.0), so on this data the filter appears to remove nothing -- confirm.
x <- dmi_egreso_total_mes[which(dmi_egreso_total_mes <= 209829245.0)]
hist(x)

# Column 63: the summary shows every statistic equal to 0, i.e. the column is constant in this data.
names(data)[63] # cp_valor_inicial: initial value of the obligation per product
## [1] "cp_valor_inicial"
summary(data$cp_valor_inicial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Column 64: print its name and histogram, then derive a first-fortnight indicator.
names(data)[64] # dh_max_dia_otros_d: last day on which a debit classified as "other" was made in the previous month
## [1] "dh_max_dia_otros_d"
hist(data$dh_max_dia_otros_d)

# Indicator: 1 where dh_max_dia_otros_d <= 15, 0 otherwise (NA stays NA).
x <- ifelse(data$dh_max_dia_otros_d <= 15, 1, 0)
table(x)
## x
##     0     1 
##  7892 10154
# Store the indicator under the "flag_" prefix used by the other derived
# indicators in this script (the original name was misspelled "flax_").
flag_maxDiaOtrosD_1quincena <- x
# Backward-compatible alias so any code that still refers to the misspelled
# name keeps working.
flax_maxDiaOtrosD_1quincena <- flag_maxDiaOtrosD_1quincena

# Column 65: the summary shows every statistic equal to 0, i.e. the column is constant in this data.
names(data)[65] # cp_cuota_sobre_saldo: installment-to-balance value per product
## [1] "cp_cuota_sobre_saldo"
summary(data$cp_cuota_sobre_saldo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Column 66: print its name, summary statistics and histogram.
names(data)[66] # dc_sum_valor_inicial: sum of the initial values of obligations across the whole financial system
## [1] "dc_sum_valor_inicial"
summary(data$dc_sum_valor_inicial)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0         0    602639         0 498786000
hist(data$dc_sum_valor_inicial)

# Column 67: print its name, summary statistics and histogram.
names(data)[67] # dh_cant_pago_tarj_d: number of outgoing transactions for credit-card payments
## [1] "dh_cant_pago_tarj_d"
summary(data$dh_cant_pago_tarj_d)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.8238  1.0000 33.0000
hist(data$dh_cant_pago_tarj_d)

# Column 68 -- dh_max_dia_pagos_d: last day on which a credit was made in the previous month.
colnames(data)[68]
## [1] "dh_max_dia_pagos_d"
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$dh_max_dia_pagos_d)

# Indicator: 1 where dh_max_dia_pagos_d <= 15, 0 otherwise (NA stays NA).
x <- ifelse(test = data[["dh_max_dia_pagos_d"]] <= 15, yes = 1, no = 0)
table(x)
## x
##    0    1 
## 9762 8284
# Keep the indicator under its own name.
flag_maxDiaPagos_d_1quincena <- x

# Column 69: the summary shows every statistic equal to 0, i.e. the column is constant in this data.
names(data)[69] # cp_saldo_sobre_inicial: balance-to-initial value per product, previous month
## [1] "cp_saldo_sobre_inicial"
summary(data$cp_saldo_sobre_inicial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Column 70: print its name, summary statistics and histogram.
names(data)[70] # pc_mediana_nom3: median of the last 3 payroll payments, used to compute the payroll-loan refinancing
## [1] "pc_mediana_nom3"
summary(data$pc_mediana_nom3)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    72055        0 64345000
hist(data$pc_mediana_nom3)

# Column 71: the summary shows every statistic equal to 0, i.e. the column is constant in this data.
names(data)[71] # cp_esta_cuota_otro: status of the "other" installment
## [1] "cp_esta_cuota_otro"
summary(data$cp_esta_cuota_otro)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Column 72: print its name and histogram.
names(data)[72] # dh_max_dia_retiros_d: last day on which a withdrawal was made in the previous month
## [1] "dh_max_dia_retiros_d"
hist(data$dh_max_dia_retiros_d)

# incorrect variable: the 3rd quartile is 30 and a month has at most 31 days
# NOTE(review): no summary() is printed for this column, so the quartile claim cannot be checked here -- confirm.

# Column 73: print its name, summary statistics and histogram.
names(data)[73] # dh_avg_dia_entradas: average day of the month on which money inflows are received
## [1] "dh_avg_dia_entradas"
summary(data$dh_avg_dia_entradas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   4.161   3.892   6.125  31.000
hist(data$dh_avg_dia_entradas)

# Column 74 -- dh_avg_dia_pagos_d: average day of the month on which payment outflows are made.
colnames(data)[74]
## [1] "dh_avg_dia_pagos_d"
# Indicator: 1 where dh_avg_dia_pagos_d <= 10, 0 otherwise (NA stays NA); only tabulated here.
x <- ifelse(test = data[["dh_avg_dia_pagos_d"]] <= 10, yes = 1, no = 0)
table(x)
## x
##     0     1 
##    45 18001
# Column 75: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
names(data)[75] # dh_val_salidas: total value of outgoing transactions in a month
## [1] "dh_val_salidas"
summary(data$dh_val_salidas)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   5884591  19619223  28254748 204098909
hist(data$dh_val_salidas)

quantile(data$dh_val_salidas, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##          6%          7%          8%          9%         10%         11% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         12%         13%         14%         15%         16%         17% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         18%         19%         20%         21%         22%         23% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         24%         25%         26%         27%         28%         29% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         30%         31%         32%         33%         34%         35% 
##         0.0         0.0         0.0         0.0     11359.0     12350.0 
##         36%         37%         38%         39%         40%         41% 
##     76400.8    342377.0    679878.3   1118683.2   1541005.0   1938851.1 
##         42%         43%         44%         45%         46%         47% 
##   2298733.0   2740303.9   3183321.0   3592226.5   4022760.0   4466078.0 
##         48%         49%         50%         51%         52%         53% 
##   4917904.6   5332810.0   5884591.0   6356795.0   6873811.8   7663007.0 
##         54%         55%         56%         57%         58%         59% 
##   8257288.2   9064578.0   9809325.0  10475107.3  11023429.0  11765300.4 
##         60%         61%         62%         63%         64%         65% 
##  12624035.0  13404973.0  14242719.6  15034490.1  15802825.6  16946976.0 
##         66%         67%         68%         69%         70%         71% 
##  17905644.0  18764929.0  19700935.4  20697842.0  21641359.5  23028731.0 
##         72%         73%         74%         75%         76%         77% 
##  24249542.8  25385925.3  26679698.7  28254747.5  29644945.6  30871852.0 
##         78%         79%         80%         81%         82%         83% 
##  32539082.0  34463859.1  36134965.0  37813640.0  39947167.0  41741354.8 
##         84%         85%         86%         87%         88%         89% 
##  43999606.0  46745102.2  48942396.4  50977400.0  52974824.8  55714561.0 
##         90%         91%         92%         93%         94%         95% 
##  59028184.5  62668751.4  66304736.6  70688799.0  76451589.0  82043497.0 
##         96%         97%         98%         99%        100% 
##  90022669.6  99748689.3 113054665.2 132338269.6 204098909.0
# Workspace copy of the full (unfiltered) column.
dh_val_salidas <- data[["dh_val_salidas"]]

# Histogram of the values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff is above the 100% percentile printed above
# (204098909.0), so on this data the filter appears to remove nothing -- confirm.
x <- dh_val_salidas[which(dh_val_salidas <= 217011100)]
hist(x)

# Column 76: print its name, summary statistics and histogram.
# NOTE(review): the description below repeats the one given for dh_val_salidas; judging by the
# column name it presumably is a sum of installment values -- confirm against the data dictionary.
names(data)[76] # dc_sum_valor_cuota: total value of outgoing transactions in a month
## [1] "dc_sum_valor_cuota"
summary(data$dc_sum_valor_cuota)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    26756        0 10504000
hist(data$dc_sum_valor_cuota)

# Column 77 -- dh_min_dia_tras_d: first day on which a transfer payment was made in the previous month.
colnames(data)[77]
## [1] "dh_min_dia_tras_d"
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$dh_min_dia_tras_d)

# Indicator: 1 where dh_min_dia_tras_d <= 10, 0 otherwise (NA stays NA); only tabulated here.
x <- ifelse(test = data[["dh_min_dia_tras_d"]] <= 10, yes = 1, no = 0)
table(x)
## x
##     0     1 
##  1640 16406
# Column 78: the summary shows every statistic equal to 0, i.e. the column is constant in this data.
names(data)[78] # cp_porc_valorcuot_ing: ratio of the installment value to income, per product
## [1] "cp_porc_valorcuot_ing"
summary(data$cp_porc_valorcuot_ing)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# Column 79: print its name, summary statistics, histogram and the 0-100% percentiles in 1% steps.
# The summary shows that this column takes negative values (minimum -6478753).
names(data)[79] # pc_ind_ajustado: customer's adjusted net disposable income
## [1] "pc_ind_ajustado"
summary(data$pc_ind_ajustado)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -6478753   319157  3173260  6474900  8958133 59465796
hist(data$pc_ind_ajustado)

quantile(data$pc_ind_ajustado, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
## -6478752.60 -1584635.43  -736622.04  -363284.48  -136329.26   -28157.31 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         18%         19%         20%         21%         22%         23% 
##        0.00        0.00        0.00    10854.35    95242.50   166275.00 
##         24%         25%         26%         27%         28%         29% 
##   229297.05   319157.46   414630.00   515272.66   656092.98   773018.33 
##         30%         31%         32%         33%         34%         35% 
##   851723.71   955501.35  1106847.00  1197787.50  1316281.58  1374374.25 
##         36%         37%         38%         39%         40%         41% 
##  1463705.35  1558277.54  1644887.26  1739666.44  1860434.10  1979169.46 
##         42%         43%         44%         45%         46%         47% 
##  2082131.05  2210792.04  2333189.63  2474835.75  2578853.57  2661750.00 
##         48%         49%         50%         51%         52%         53% 
##  2819938.87  2966919.38  3173260.41  3314359.51  3431853.07  3568431.40 
##         54%         55%         56%         57%         58%         59% 
##  3786186.59  4016250.00  4159682.15  4360409.42  4488738.16  4630921.05 
##         60%         61%         62%         63%         64%         65% 
##  4760138.69  4974054.87  5297034.28  5504701.54  5846234.27  6117532.40 
##         66%         67%         68%         69%         70%         71% 
##  6380682.46  6673624.40  6940216.30  7237858.50  7635737.48  7844523.75 
##         72%         73%         74%         75%         76%         77% 
##  8066551.78  8352009.55  8678789.30  8958132.53  9306000.90  9765300.34 
##         78%         79%         80%         81%         82%         83% 
## 10257365.98 10770282.61 11156934.11 11599371.56 12020464.91 12773224.12 
##         84%         85%         86%         87%         88%         89% 
## 13320885.71 13937072.75 14681655.47 15506136.16 16123060.47 17059105.95 
##         90%         91%         92%         93%         94%         95% 
## 17978599.46 19117837.97 20277254.42 22345186.29 24072384.41 25778069.55 
##         96%         97%         98%         99%        100% 
## 28130216.60 30465696.15 33894017.04 38331857.83 59465796.00
# Histogram of the pc_ind_ajustado values at or below the cutoff (NA comparisons are dropped).
# NOTE(review): the cutoff is above the 100% percentile printed above
# (59465796.00), so on this data the filter appears to remove nothing -- confirm.
x <- data[["pc_ind_ajustado"]][
  which(data[["pc_ind_ajustado"]] <= 68107989.8)
]
hist(x)

# Column 80: print its name, summary statistics and histogram.
names(data)[80] # dh_val_retiros_d: total value of outgoing transactions for withdrawals in a month
## [1] "dh_val_retiros_d"
summary(data$dh_val_retiros_d)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   1235198  11122177  13014757 197776495
hist(data$dh_val_retiros_d)

# Column 81: print its name, summary statistics and histogram.
names(data)[81] # pc_tiem_lt_prod_abie_total: time of the last opened product
## [1] "pc_tiem_lt_prod_abie_total"
summary(data$pc_tiem_lt_prod_abie_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0778  0.0000 58.0000
hist(data$pc_tiem_lt_prod_abie_total)

# Column 82 -- marca_info_cifin_decode: Cifin flag (queried, not queried, not found, etc.).
colnames(data)[82]
## [1] "marca_info_cifin_decode"
table(data[["marca_info_cifin_decode"]])
## 
##     0     1     2 
## 12258   252  5536
# Indicator: 1 where the code equals 0, 0 for any other code (NA stays NA).
# NOTE(review): presumably code 0 means "found in Cifin" -- confirm against the data dictionary.
x <- ifelse(test = data[["marca_info_cifin_decode"]] == 0, yes = 1, no = 0)
table(x)
## x
##     0     1 
##  5788 12258
# Keep the indicator under its own name.
flag_encontrado_cifin <- x

# Column 83: print its name and histogram.
names(data)[83] # dh_max_dia_pago_tarj_d: last day on which the credit card was paid in the previous month
## [1] "dh_max_dia_pago_tarj_d"
hist(data$dh_max_dia_pago_tarj_d)

# Column 84: print its name, summary statistics and histogram.
names(data)[84] # dc_valobli_ing: sum of the initial value of the obligations in the financial system divided by income
## [1] "dc_valobli_ing"
summary(data$dc_valobli_ing)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.05584  0.00000 58.56358
hist(data$dc_valobli_ing)

# Column 85: print its name and the frequency of each distinct value.
names(data)[85] # pc_cantidad_tdc_entidad: number of credit cards at the bank
## [1] "pc_cantidad_tdc_entidad"
table(data$pc_cantidad_tdc_entidad)
## 
##     0     1     2     3     4 
## 17833    93    56    13    51
# Column 86 -- dh_min_dia_otros_d: first day on which a debit classified as "other" was made in the previous month.
colnames(data)[86]
## [1] "dh_min_dia_otros_d"
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$dh_min_dia_otros_d)

# Indicator: 1 where dh_min_dia_otros_d <= 15, 0 otherwise (NA stays NA); only tabulated here.
x <- ifelse(test = data[["dh_min_dia_otros_d"]] <= 15, yes = 1, no = 0)
table(x)
## x
##     0     1 
##  1976 16070
# Column 87 -- dc_cant_obligaciones: number of obligations.
colnames(data)[87]
## [1] "dc_cant_obligaciones"
table(data[["dc_cant_obligaciones"]])
## 
##     0     1     2     3     4     5     6     7     8     9    10    11    13 
## 17809    18    10    32    22    23    23    10    62     8     2    11     1 
##    14    15    17    18    20 
##     1     4     3     6     1
summary(data[["dc_cant_obligaciones"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.08423  0.00000 20.00000
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$dc_cant_obligaciones)

# Indicator: 1 where there is at least one obligation, 0 otherwise (NA stays NA); only tabulated here.
x <- ifelse(test = data[["dc_cant_obligaciones"]] >= 1, yes = 1, no = 0)
table(x)
## x
##     0     1 
## 17809   237
# Column 88: print its name and summary statistics.
names(data)[88] # cpc_sum_nro_cuota: sum of the number of installments across all the customer's obligations
## [1] "cpc_sum_nro_cuota"
summary(data$cpc_sum_nro_cuota)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   20.00   78.43  107.00 1108.00
# Column 89 -- cpc_avg_nro_cuota: average number of installments across all the customer's products.
colnames(data)[89]
## [1] "cpc_avg_nro_cuota"
summary(data[["cpc_avg_nro_cuota"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   14.00   34.64   51.00  446.00
# hist() call text kept verbatim: the default plot title is built from it.
hist(data$cpc_avg_nro_cuota)

# Workspace copy of the column.
cpc_avg_nro_cuota <- data[["cpc_avg_nro_cuota"]]

names(data)[90] # cpc_max_nro_cuota: Número de cuotas máximo de un cliente entre todos sus productos
## [1] "cpc_max_nro_cuota"
names(data)[91] # cp_saldo: Valor del saldo del mes anterior por producto
## [1] "cp_saldo"
summary(data$cp_saldo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[92] # cp_cuota_sobre_inicial: valor de la cuota sobre el valor desembolsado incial por producto
## [1] "cp_cuota_sobre_inicial"
summary(data$cp_cuota_sobre_inicial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[93] # cpc_sum_saldo: Suma del saldo de todos los productos activos del cliente
## [1] "cpc_sum_saldo"
summary(data$cpc_sum_saldo)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  -1129268         0   1108448  11329613   8989011 220890667
hist(data$cpc_sum_saldo)

quantile(data$cpc_sum_saldo, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
##  -1129268.0         0.0         0.0         0.0         0.0         0.0 
##          6%          7%          8%          9%         10%         11% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         12%         13%         14%         15%         16%         17% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         18%         19%         20%         21%         22%         23% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         24%         25%         26%         27%         28%         29% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         30%         31%         32%         33%         34%         35% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         36%         37%         38%         39%         40%         41% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         42%         43%         44%         45%         46%         47% 
##         0.0         0.0         0.0     33950.0    110565.4    348394.0 
##         48%         49%         50%         51%         52%         53% 
##    613901.0    900660.1   1108447.5   1439348.3   1743853.2   2009279.1 
##         54%         55%         56%         57%         58%         59% 
##   2230500.0   2464656.0   2738263.0   3022119.5   3254198.7   3529597.0 
##         60%         61%         62%         63%         64%         65% 
##   3829456.0   4079343.0   4345954.0   4611392.3   4843450.0   5057907.0 
##         66%         67%         68%         69%         70%         71% 
##   5295887.0   5527366.0   5700842.4   5994608.2   6311746.0   6793297.0 
##         72%         73%         74%         75%         76%         77% 
##   7411579.2   7832990.0   8379714.0   8989011.2   9577423.2  10105086.2 
##         78%         79%         80%         81%         82%         83% 
##  10746466.0  11413477.4  12158710.0  12852364.0  14071157.1  14697477.0 
##         84%         85%         86%         87%         88%         89% 
##  15465740.8  16480607.0  17754309.9  19526399.3  21521641.0  23601753.0 
##         90%         91%         92%         93%         94%         95% 
##  27460463.5  32852491.0  39550370.6  45636780.7  56445784.0  68892210.8 
##         96%         97%         98%         99%        100% 
##  79424456.4  96412554.3 116216129.4 145505224.0 220890667.4
# Histogram of the values at or below the cut-off (rows whose comparison
# is NA are left out).
# NOTE(review): the quantile output above reports a maximum of 220890667.4,
# which is below this cut-off, so the filter currently excludes nothing.
x <- data$cpc_sum_saldo[which(data$cpc_sum_saldo <= 220905770.00)]
hist(x)

# Keep the column as a standalone vector for the later variable unification.
cpc_sum_saldo <- data[["cpc_sum_saldo"]]

names(data)[94] # cp_porc_saldo_ing: valor del saldo sobre ingreso por producto
## [1] "cp_porc_saldo_ing"
summary(data$cp_porc_saldo_ing)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[95] # gsm_mejor_gestion: Mejor gestión realizada en el mes anterior
## [1] "gsm_mejor_gestion"
table(data$gsm_mejor_gestion)
## 
##     0    16 
## 18044     2
names(data)[96] # dh_min_dia_nomina_c
## [1] "dh_min_dia_nomina_c"
names(data)[97] # dh_max_dia_nomina_c: Ultimo dia en el que recibió pago de nomina en el mes anterior
## [1] "dh_max_dia_nomina_c"
hist(data$dh_max_dia_nomina_c)

names(data)[98] # cp_valor_cuota: Valor de cuota por producto
## [1] "cp_valor_cuota"
summary(data$cp_valor_cuota)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[99] # cpc_nro_cuota_tdc: Suma de los número de cuotas de todas las tarjetas de crédito del cliente
## [1] "cpc_nro_cuota_tdc"
names(data)[100] # gsm_prom_dias_gest: Promedio de los días en que se realizaron gestiones en el mes anterior
## [1] "gsm_prom_dias_gest"
names(data)[101] # pc_cuota_no_rot_ent: Cuota de productos no rotativos en el banco
## [1] "pc_cuota_no_rot_ent"
summary(data$pc_cuota_no_rot_ent)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   23699       0 6342000
hist(data$pc_cuota_no_rot_ent)

names(data)[102] # dh_val_nomina_c: Valor total de las trasacciones de entrada por concepto de nomina en un mes
## [1] "dh_val_nomina_c"
summary(data$dh_val_nomina_c)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0   101166        0 42000000
hist(data$dh_val_nomina_c)

quantile(data$dh_val_nomina_c, seq(0,1,by=0.01))
##       0%       1%       2%       3%       4%       5%       6%       7% 
##        0        0        0        0        0        0        0        0 
##       8%       9%      10%      11%      12%      13%      14%      15% 
##        0        0        0        0        0        0        0        0 
##      16%      17%      18%      19%      20%      21%      22%      23% 
##        0        0        0        0        0        0        0        0 
##      24%      25%      26%      27%      28%      29%      30%      31% 
##        0        0        0        0        0        0        0        0 
##      32%      33%      34%      35%      36%      37%      38%      39% 
##        0        0        0        0        0        0        0        0 
##      40%      41%      42%      43%      44%      45%      46%      47% 
##        0        0        0        0        0        0        0        0 
##      48%      49%      50%      51%      52%      53%      54%      55% 
##        0        0        0        0        0        0        0        0 
##      56%      57%      58%      59%      60%      61%      62%      63% 
##        0        0        0        0        0        0        0        0 
##      64%      65%      66%      67%      68%      69%      70%      71% 
##        0        0        0        0        0        0        0        0 
##      72%      73%      74%      75%      76%      77%      78%      79% 
##        0        0        0        0        0        0        0        0 
##      80%      81%      82%      83%      84%      85%      86%      87% 
##        0        0        0        0        0        0        0        0 
##      88%      89%      90%      91%      92%      93%      94%      95% 
##        0        0        0        0        0        0        0        0 
##      96%      97%      98%      99%     100% 
##        0        0   600000  2173865 42000000
names(data)[103] # banca_completa: Si el cliente pertenece al segmento banca completa o no
## [1] "banca_completa"
table(data$banca_completa)
## 
##     0 
## 18046
names(data)[104] # cpc_saldo_sobre_ing: saldo sobre ingreso por cliente
## [1] "cpc_saldo_sobre_ing"
summary(data$cpc_saldo_sobre_ing)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.07529  0.00000  0.03420  0.70105  0.50141 20.23908
hist(data$cpc_saldo_sobre_ing)

quantile(data$cpc_saldo_sobre_ing, seq(0,1,by=0.01))
##           0%           1%           2%           3%           4%           5% 
## -0.075284533  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##           6%           7%           8%           9%          10%          11% 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##          12%          13%          14%          15%          16%          17% 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##          18%          19%          20%          21%          22%          23% 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##          24%          25%          26%          27%          28%          29% 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##          30%          31%          32%          33%          34%          35% 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##          36%          37%          38%          39%          40%          41% 
##  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000  0.000000000 
##          42%          43%          44%          45%          46%          47% 
##  0.000000000  0.000000000  0.000000000  0.001327638  0.003920323  0.010485818 
##          48%          49%          50%          51%          52%          53% 
##  0.017391611  0.026439748  0.034204096  0.042727976  0.053022024  0.064025658 
##          54%          55%          56%          57%          58%          59% 
##  0.072661546  0.082753374  0.095978925  0.109599685  0.119360811  0.133522179 
##          60%          61%          62%          63%          64%          65% 
##  0.145511888  0.153764215  0.165604222  0.180229507  0.195539646  0.214586774 
##          66%          67%          68%          69%          70%          71% 
##  0.239741468  0.260049473  0.280311386  0.297297228  0.321735771  0.349650555 
##          72%          73%          74%          75%          76%          77% 
##  0.378458794  0.413185699  0.459745553  0.501406376  0.534302699  0.570659962 
##          78%          79%          80%          81%          82%          83% 
##  0.614790560  0.670961859  0.723906664  0.810553439  0.862300980  0.949874583 
##          84%          85%          86%          87%          88%          89% 
##  1.034139144  1.126322423  1.224357805  1.353146272  1.492494873  1.647897690 
##          90%          91%          92%          93%          94%          95% 
##  1.813319044  1.978121143  2.180685275  2.429309111  2.848231115  3.195820189 
##          96%          97%          98%          99%         100% 
##  4.530609759  6.130061812  7.594766901  9.837172114 20.239077899
# Histogram of the values at or below the cut-off (rows whose comparison
# is NA are left out).
# NOTE(review): the quantile output above reports a maximum of 20.239077899,
# which is below this cut-off, so the filter currently excludes nothing.
x <- data$cpc_saldo_sobre_ing[which(data$cpc_saldo_sobre_ing <= 20.307294513)]
hist(x)

# Keep the column as a standalone vector for the later variable unification.
cpc_saldo_sobre_ing <- data[["cpc_saldo_sobre_ing"]]

names(data)[105] # dh_min_dia_pago_cred_d: Primer dia en el que realizó pago de credito en el mes anterior
## [1] "dh_min_dia_pago_cred_d"
hist(data$dh_min_dia_pago_cred_d)

dh_min_dia_pago_cred_d <- data$dh_min_dia_pago_cred_d

names(data)[106] # cpc_saldo_tdc: Saldo total en tarjetas de crédito del cliente
## [1] "cpc_saldo_tdc"
summary(data$cpc_saldo_tdc)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1129268        0   318800  4323266  5989105 42938793
hist(data$cpc_saldo_tdc)

quantile(data$cpc_saldo_tdc, seq(0,1,by=0.01))
##         0%         1%         2%         3%         4%         5%         6% 
## -1129268.0      -45.1        0.0        0.0        0.0        0.0        0.0 
##         7%         8%         9%        10%        11%        12%        13% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        14%        15%        16%        17%        18%        19%        20% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        21%        22%        23%        24%        25%        26%        27% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        28%        29%        30%        31%        32%        33%        34% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        35%        36%        37%        38%        39%        40%        41% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        42%        43%        44%        45%        46%        47%        48% 
##        0.0        0.0        0.0        0.0        0.0     4536.0    39499.0 
##        49%        50%        51%        52%        53%        54%        55% 
##   131904.3   318800.0   558948.6   796776.6  1028169.0  1249210.2  1555273.0 
##        56%        57%        58%        59%        60%        61%        62% 
##  1809553.0  2020438.0  2214443.8  2427540.6  2635538.0  2846335.5  3087026.0 
##        63%        64%        65%        66%        67%        68%        69% 
##  3329693.5  3592291.4  3829456.0  4031553.2  4257540.0  4491031.6  4722643.2 
##        70%        71%        72%        73%        74%        75%        76% 
##  4932761.5  5111416.0  5366764.6  5540825.0  5735475.3  5989105.0  6209213.4 
##        77%        78%        79%        80%        81%        82%        83% 
##  6643593.0  7185642.5  7609331.2  8034478.0  8497649.0  9048885.3  9573365.3 
##        84%        85%        86%        87%        88%        89%        90% 
## 10016976.0 10596398.2 11109566.0 11723089.0 12303712.2 13004694.4 14105014.5 
##        91%        92%        93%        94%        95%        96%        97% 
## 14697477.0 15541141.0 16460852.1 17506625.0 19008508.5 20708956.0 22249488.6 
##        98%        99%       100% 
## 25956592.3 30683139.0 42938793.0
# Histogram of the values at or below the cut-off (rows whose comparison
# is NA are left out).
# NOTE(review): the quantile output above reports a maximum of 42938793.0,
# which is below this cut-off, so the filter currently excludes nothing.
x <- data$cpc_saldo_tdc[which(data$cpc_saldo_tdc <= 43186883.96)]
hist(x)

# Keep the column as a standalone vector for the later variable unification.
cpc_saldo_tdc <- data[["cpc_saldo_tdc"]]

names(data)[107] # pc_cuota_de_consumo: Cuota de crédito  de consumo reportada por CIFIN
## [1] "pc_cuota_de_consumo"
summary(data$pc_cuota_de_consumo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   22909       0 6342000
hist(data$pc_cuota_de_consumo)

names(data)[108] # dh_cant_tras_d: Cantidad de traslados de salida de dinero en un mes
## [1] "dh_cant_tras_d"
hist(data$dh_cant_tras_d)

quantile(data$dh_cant_tras_d, seq(0,1,by=0.005))
##   0.0%   0.5%   1.0%   1.5%   2.0%   2.5%   3.0%   3.5%   4.0%   4.5%   5.0% 
##      0      0      0      0      0      0      0      0      0      0      0 
##   5.5%   6.0%   6.5%   7.0%   7.5%   8.0%   8.5%   9.0%   9.5%  10.0%  10.5% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  11.0%  11.5%  12.0%  12.5%  13.0%  13.5%  14.0%  14.5%  15.0%  15.5%  16.0% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  16.5%  17.0%  17.5%  18.0%  18.5%  19.0%  19.5%  20.0%  20.5%  21.0%  21.5% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  22.0%  22.5%  23.0%  23.5%  24.0%  24.5%  25.0%  25.5%  26.0%  26.5%  27.0% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  27.5%  28.0%  28.5%  29.0%  29.5%  30.0%  30.5%  31.0%  31.5%  32.0%  32.5% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  33.0%  33.5%  34.0%  34.5%  35.0%  35.5%  36.0%  36.5%  37.0%  37.5%  38.0% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  38.5%  39.0%  39.5%  40.0%  40.5%  41.0%  41.5%  42.0%  42.5%  43.0%  43.5% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  44.0%  44.5%  45.0%  45.5%  46.0%  46.5%  47.0%  47.5%  48.0%  48.5%  49.0% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  49.5%  50.0%  50.5%  51.0%  51.5%  52.0%  52.5%  53.0%  53.5%  54.0%  54.5% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  55.0%  55.5%  56.0%  56.5%  57.0%  57.5%  58.0%  58.5%  59.0%  59.5%  60.0% 
##      0      0      0      0      0      0      0      0      0      0      0 
##  60.5%  61.0%  61.5%  62.0%  62.5%  63.0%  63.5%  64.0%  64.5%  65.0%  65.5% 
##      0      0      0      0      0      0      0      0      0      0      1 
##  66.0%  66.5%  67.0%  67.5%  68.0%  68.5%  69.0%  69.5%  70.0%  70.5%  71.0% 
##      1      1      1      1      1      1      1      1      1      1      1 
##  71.5%  72.0%  72.5%  73.0%  73.5%  74.0%  74.5%  75.0%  75.5%  76.0%  76.5% 
##      1      1      2      2      2      2      2      2      2      2      2 
##  77.0%  77.5%  78.0%  78.5%  79.0%  79.5%  80.0%  80.5%  81.0%  81.5%  82.0% 
##      3      3      3      3      3      3      3      4      4      4      4 
##  82.5%  83.0%  83.5%  84.0%  84.5%  85.0%  85.5%  86.0%  86.5%  87.0%  87.5% 
##      4      4      5      5      5      5      6      6      6      6      7 
##  88.0%  88.5%  89.0%  89.5%  90.0%  90.5%  91.0%  91.5%  92.0%  92.5%  93.0% 
##      7      8      8      8      9      9      9     10     10     11     11 
##  93.5%  94.0%  94.5%  95.0%  95.5%  96.0%  96.5%  97.0%  97.5%  98.0%  98.5% 
##     12     12     13     14     14     15     16     17     18     19     20 
##  99.0%  99.5% 100.0% 
##     23     25     31
# Histogram of the values at or below the cut-off (rows whose comparison
# is NA are left out).
# NOTE(review): the quantile output above reports a maximum of 31, equal to
# this cut-off, so the filter currently excludes nothing.
x <- data$dh_cant_tras_d[which(data$dh_cant_tras_d <= 31)]
hist(x)

# Keep the column as a standalone vector for the later variable unification.
dh_cant_tras_d <- data[["dh_cant_tras_d"]]

names(data)[109] # dh_max_dia_comisio_d
## [1] "dh_max_dia_comisio_d"
names(data)[110] # cpc_avg_saldo: Promedio del saldo de las obligaciones del cliente en el mes anterior
## [1] "cpc_avg_saldo"
summary(data$cpc_avg_saldo)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  -1011469         0    774422   5152861   4793250 106882802
hist(data$cpc_avg_saldo)

quantile(data$cpc_avg_saldo, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
##  -1011469.0         0.0         0.0         0.0         0.0         0.0 
##          6%          7%          8%          9%         10%         11% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         12%         13%         14%         15%         16%         17% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         18%         19%         20%         21%         22%         23% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         24%         25%         26%         27%         28%         29% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         30%         31%         32%         33%         34%         35% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         36%         37%         38%         39%         40%         41% 
##         0.0         0.0         0.0         0.0         0.0         0.0 
##         42%         43%         44%         45%         46%         47% 
##         0.0         0.0         0.0     16975.0     69850.0    206899.0 
##         48%         49%         50%         51%         52%         53% 
##    365970.0    565479.8    774421.5    940160.4   1053110.0   1202192.1 
##         54%         55%         56%         57%         58%         59% 
##   1358774.5   1517750.0   1668502.5   1818995.1   1946590.0   2064736.0 
##         60%         61%         62%         63%         64%         65% 
##   2199797.0   2328455.4   2429205.4   2528953.5   2676266.0   2793238.0 
##         66%         67%         68%         69%         70%         71% 
##   2973823.1   3124962.0   3305366.2   3512956.1   3720362.0   3908560.0 
##         72%         73%         74%         75%         76%         77% 
##   4146008.0   4315487.0   4534635.6   4793250.0   4927245.0   5105438.0 
##         78%         79%         80%         81%         82%         83% 
##   5327563.0   5540825.0   5755606.8   6023367.8   6305529.5   6769565.3 
##         84%         85%         86%         87%         88%         89% 
##   7375373.0   7884712.2   8498791.6   9327747.6  10567824.0  11633631.6 
##         90%         91%         92%         93%         94%         95% 
##  13229350.7  15008544.5  17404416.0  19849268.4  21885903.6  25137925.4 
##         96%         97%         98%         99%        100% 
##  33473766.1  40065113.3  47628017.1  66073881.0 106882801.8
# Histogram of the values at or below the cut-off (rows whose comparison
# is NA are left out).
# NOTE(review): the quantile output above reports a maximum of 106882801.8,
# which is below this cut-off, so the filter currently excludes nothing.
x <- data$cpc_avg_saldo[which(data$cpc_avg_saldo <= 106884393.50)]
hist(x)

# Keep the column as a standalone vector for the later variable unification.
cpc_avg_saldo <- data[["cpc_avg_saldo"]]

names(data)[111] # dc_max_saldo_sf: máximo saldo en el sistema financiero (Solo Financiero)
## [1] "dc_max_saldo_sf"
summary(data$dc_max_saldo_sf)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0         0    259660         0 182310000
names(data)[112] # dh_val_pago_tarj_d: Valor total de las trasacciones de salida por concepto de pagos de tdc en un mes
## [1] "dh_val_pago_tarj_d"
summary(data$dh_val_pago_tarj_d)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0   427965   310071 10559486
hist(data$dh_val_pago_tarj_d)

quantile(data$dh_val_pago_tarj_d, seq(0,1,by=0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##         7%         8%         9%        10%        11%        12%        13% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        14%        15%        16%        17%        18%        19%        20% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        21%        22%        23%        24%        25%        26%        27% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        28%        29%        30%        31%        32%        33%        34% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        35%        36%        37%        38%        39%        40%        41% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        42%        43%        44%        45%        46%        47%        48% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        49%        50%        51%        52%        53%        54%        55% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        56%        57%        58%        59%        60%        61%        62% 
##        0.0        0.0        0.0        0.0        0.0        0.0        0.0 
##        63%        64%        65%        66%        67%        68%        69% 
##        0.0        0.0        0.0     6823.0    20000.0    34674.4    61262.1 
##        70%        71%        72%        73%        74%        75%        76% 
##   113870.0   154310.6   190467.0   230752.0   268506.0   310071.0   353957.8 
##        77%        78%        79%        80%        81%        82%        83% 
##   401450.0   440004.8   485001.7   549448.0   611841.9   681124.1   720438.2 
##        84%        85%        86%        87%        88%        89%        90% 
##   795860.0   864850.0   946432.5  1017867.0  1122199.0  1239723.7  1383143.5 
##        91%        92%        93%        94%        95%        96%        97% 
##  1511667.0  1671642.8  1883813.3  2088449.4  2368354.8  2818150.4  3187581.8 
##        98%        99%       100% 
##  3910841.7  5267074.4 10559486.0
# Histogram of the values at or below the cut-off (rows whose comparison
# is NA are left out).
# NOTE(review): the quantile output above reports a maximum of 10559486.0,
# which is below this cut-off, so the filter currently excludes nothing.
x <- data$dh_val_pago_tarj_d[which(data$dh_val_pago_tarj_d <= 10560315.0)]
hist(x)

# Keep the column as a standalone vector for the later variable unification.
dh_val_pago_tarj_d <- data[["dh_val_pago_tarj_d"]]

names(data)[113] # pc_productos_no_rotativos_entidad: Cantidad de productos no rotativos en el banco
## [1] "pc_productos_no_rotativos_entidad"
table(data$pc_productos_no_rotativos_entidad) 
## 
##     0     1     2     3     4     5     6     9    18 
## 17850    54    38    20    36     4    10     1    33
hist(data$pc_productos_no_rotativos_entidad)

names(data)[114] # pc_saldo_no_rot_ent: Obligaciones no rotativos en el banco
## [1] "pc_saldo_no_rot_ent"
summary(data$pc_saldo_no_rot_ent)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0         0    918154         0 229052000
names(data)[115] # pc_vi_no_rotativos_entidad: Obligaciones no rotativos en el banco
## [1] "pc_vi_no_rotativos_entidad"
summary(data$pc_vi_no_rotativos_entidad)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    1033       0  233279
names(data)[116] # dh_min_dia_entradas
## [1] "dh_min_dia_entradas"
names(data)[117] # pcons_hipotecario_vivienda: Es el producto un hipotecario
## [1] "pcons_hipotecario_vivienda"
table(data$pcons_hipotecario_vivienda)
## 
##     0     1 
## 17500   546
pcons_hipotecario_vivienda <- data$pcons_hipotecario_vivienda

names(data)[118] # gsm_mejor_gestion_3m: Mejor gestión realizada en los últimos 3 meses
## [1] "gsm_mejor_gestion_3m"
table(data$gsm_mejor_gestion_3m)
## 
##     0    16 
## 18044     2
names(data)[119] # dh_avg_dia_salidas: Dia promedio del mes en el que hace las salidas de dinero
## [1] "dh_avg_dia_salidas"
names(data)[120] # y_auto_cura: Variable respuesta: si el cliente se autocuro o no
## [1] "y_auto_cura"
table(data$y_auto_cura)
## 
##     0     1 
##  5590 12456
Y <- data$y_auto_cura

names(data)[122] # segmentoestructural: Segmento  estructural
## [1] "segmentoestructural"
table(data$segmentoestructural)
## 
##  PYME 
## 18046
names(data)[123] # subsegmentoestructural
## [1] "subsegmentoestructural"
table(data$subsegmentoestructural)
## 
## Pyme Pequena 
##        18046

Unificación de variables

# Gather the response (Y) and every engineered predictor into one table.
# The pieces are first bound column-wise and the result is then converted
# to a data frame; column names come from the argument names listed here.
# NOTE(review): presumably every input is a plain vector of equal length, in
# which case cbind() goes through a single-typed matrix -- confirm that no
# input is character/factor, since that would coerce every column.
unificado <- local({
  columnas <- cbind(
    Y,
    # delinquency-day aggregates
    max_sem,
    desv_sem,
    prom_bim,
    max_mes_anterior,
    prom_mes_anterior,
    prom_sem,
    max_bim,
    prom_trim,
    pc_cant_moras_30_ult_12_meses,
    desv_trim,
    desv_bim,
    # columns taken from the raw dataset
    dh_cant_entradas,
    pc_transaccional,
    dh_val_entradas,
    pcons_tarjeta_de_credito,
    dh_avg_dia_retiros_d,
    dmi_max_egreso_diario,
    dh_cant_otros_d,
    dmi_max_ingreso_diario,
    dh_val_otros_d,
    pc_ingreso_final,
    dh_cant_pagos_d,
    dmi_ingreso_total_mes,
    dh_val_pagos_d,
    pc_gasto_familiar,
    pc_cuotas_pagadas,
    pc_ingreso_rutina_con_techo,
    dh_cant_salidas,
    dh_min_dia_pagos_d,
    pc_ingreso_por_rutina,
    dmi_egreso_total_mes,
    dh_val_salidas,
    cpc_avg_nro_cuota,
    cpc_sum_saldo,
    cpc_saldo_sobre_ing,
    dh_min_dia_pago_cred_d,
    cpc_saldo_tdc,
    dh_cant_tras_d,
    cpc_avg_saldo,
    dh_val_pago_tarj_d,
    pcons_hipotecario_vivienda,
    # derived 0/1 flags
    flag_mora60_ult12meses,
    flag_ultima_entrada_1quincena,
    flag_tuvo_mora60_ult3meses,
    flag_diaPago_1quincena,
    flag_es_cluster_6,
    flag_tuvo_mora90_ult12M,
    flax_maxDiaOtrosD_1quincena,
    flag_maxDiaPagos_d_1quincena,
    flag_encontrado_cifin
  )
  as.data.frame(columnas)
})

dim(unificado)
## [1] 18046    51
str(unificado)
## 'data.frame':    18046 obs. of  51 variables:
##  $ Y                            : num  0 0 0 0 0 0 1 1 1 1 ...
##  $ max_sem                      : num  7 17 36 27 32 32 0 0 0 12 ...
##  $ desv_sem                     : num  2.86 7.64 12.87 10.89 11.65 ...
##  $ prom_bim                     : num  1.87 3.46 3.67 3.67 3.16 ...
##  $ max_mes_anterior             : num  7 17 27 27 20 27 0 0 0 12 ...
##  $ prom_mes_anterior            : num  2.65 4.12 5.2 5.2 4.47 ...
##  $ prom_sem                     : num  1.17 8 21 4.83 20.83 ...
##  $ max_bim                      : num  7 17 27 27 20 30 0 0 0 12 ...
##  $ prom_trim                    : num  1.53 3.7 3.74 3 4.16 ...
##  $ pc_cant_moras_30_ult_12_meses: num  1 1 0 1 0 0 0 0 0 0 ...
##  $ desv_trim                    : num  2.01 2.4 3.68 3.95 4.02 ...
##  $ desv_bim                     : num  2.22 2.66 4.37 4.37 3.76 ...
##  $ dh_cant_entradas             : num  0 3.74 2.24 0 1.41 ...
##  $ pc_transaccional             : num  3000000 7710300 6670935 3000000 3150000 ...
##  $ dh_val_entradas              : num  0 2110 1785 0 1000 ...
##  $ pcons_tarjeta_de_credito     : num  0 0 0 0 0 0 1 0 0 1 ...
##  $ dh_avg_dia_retiros_d         : num  0 16.6 22 0 16.2 ...
##  $ dmi_max_egreso_diario        : num  0 2061032 2131531 0 1002318 ...
##  $ dh_cant_otros_d              : num  0 1.41 1.41 0 0 ...
##  $ dmi_max_ingreso_diario       : num  0 2210300 1858600 0 1000000 ...
##  $ dh_val_otros_d               : num  0 22651 20200 0 0 ...
##  $ pc_ingreso_final             : num  3000000 6553755 6670935 3000000 2677500 ...
##  $ dh_cant_pagos_d              : num  0 8 1 0 4 1 0 10 13 0 ...
##  $ dmi_ingreso_total_mes        : num  0 4452485 3187600 0 1000000 ...
##  $ dh_val_pagos_d               : num  0 3085738 1 0 1978774 ...
##  $ pc_gasto_familiar            : num  765000 2313090 1701088 765000 945000 ...
##  $ pc_cuotas_pagadas            : num  0 0 0 0 0 ...
##  $ pc_ingreso_rutina_con_techo  : num  3000000 7710300 6670935 3000000 3150000 ...
##  $ dh_cant_salidas              : num  0 28 11 0 10 11 0 72 69 0 ...
##  $ dh_min_dia_pagos_d           : num  0 13 7 0 2 7 0 9 8 0 ...
##  $ pc_ingreso_por_rutina        : num  3000000 7710300 6670935 3000000 3150000 ...
##  $ dmi_egreso_total_mes         : num  0 4921015 3055581 0 1998986 ...
##  $ dh_val_salidas               : num  0 4921015 3055581 0 1998986 ...
##  $ cpc_avg_nro_cuota            : num  0 0 0 0 0 0 0 64 70 0 ...
##  $ cpc_sum_saldo                : num  0 0 0 0 0 ...
##  $ cpc_saldo_sobre_ing          : num  0 0 0 0 0 ...
##  $ dh_min_dia_pago_cred_d       : num  0 13 7 0 2 7 0 17 15 0 ...
##  $ cpc_saldo_tdc                : num  0 0 0 0 0 ...
##  $ dh_cant_tras_d               : num  0 0 0 0 0 0 0 4 2 0 ...
##  $ cpc_avg_saldo                : num  0 0 0 0 0 ...
##  $ dh_val_pago_tarj_d           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pcons_hipotecario_vivienda   : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ flag_mora60_ult12meses       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flag_ultima_entrada_1quincena: num  1 0 0 1 0 0 1 0 0 1 ...
##  $ flag_tuvo_mora60_ult3meses   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flag_diaPago_1quincena       : num  1 1 1 0 0 0 1 0 0 0 ...
##  $ flag_es_cluster_6            : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ flag_tuvo_mora90_ult12M      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flax_maxDiaOtrosD_1quincena  : num  1 0 0 1 1 0 1 0 0 1 ...
##  $ flag_maxDiaPagos_d_1quincena : num  1 0 1 1 0 1 1 0 0 1 ...
##  $ flag_encontrado_cifin        : num  0 1 1 0 1 1 0 1 1 0 ...

Selección de variables

Se utilizará la función nearZeroVar para eliminar variables con varianza cercana a 0 y la función findCorrelation para eliminar variables independientes cuya correlación absoluta entre sí supere el umbral de 0.8.

# Split the unified table into the response vector (Y) and the predictor
# table (X); the response column is removed from `unificado` itself.
Y <- unificado$Y
unificado$Y <- NULL
X <- unificado

# nearZeroVar: column positions of predictors with zero or near-zero variance.
i <- nearZeroVar(X)
# Predictors to drop because they carry (almost) no variance.
names(X)[i]
## [1] "dh_cant_salidas"            "cpc_avg_nro_cuota"         
## [3] "pcons_hipotecario_vivienda"
# Only drop when something was flagged: with an empty index, X[, -i] would
# select zero columns instead of keeping all of them. drop = FALSE keeps X a
# data frame even if a single column were left.
if (length(i) > 0) {
  X <- X[, -i, drop = FALSE]
}

# findCorrelation: column positions of predictors to remove so that no
# remaining pair exceeds the 0.8 pairwise-correlation cutoff.
i <- findCorrelation(cor(X), cutoff = 0.8)
# Predictors to drop because they are highly correlated with others.
names(X)[i]
##  [1] "dh_val_entradas"               "dh_val_salidas"               
##  [3] "dmi_egreso_total_mes"          "dmi_ingreso_total_mes"        
##  [5] "flag_maxDiaPagos_d_1quincena"  "flag_ultima_entrada_1quincena"
##  [7] "dmi_max_egreso_diario"         "pc_gasto_familiar"            
##  [9] "prom_trim"                     "pc_ingreso_rutina_con_techo"  
## [11] "pc_transaccional"              "pc_ingreso_final"             
## [13] "prom_bim"                      "pc_cant_moras_30_ult_12_meses"
## [15] "max_bim"                       "max_sem"                      
## [17] "prom_sem"                      "flag_mora60_ult12meses"       
## [19] "prom_mes_anterior"             "desv_bim"                     
## [21] "cpc_sum_saldo"
# Only drop when something was flagged: with an empty index, X[, -i] would
# select zero columns instead of keeping all of them. drop = FALSE keeps X a
# data frame even if a single column were left.
if (length(i) > 0) {
  X <- X[, -i, drop = FALSE]
}

# Resulting dataset: the response goes back in as the first column, followed
# by the predictors that survived both filters (names are kept as they are).
salida <- data.frame(Y = Y, X, check.names = FALSE)
dim(salida)
## [1] 18046    27
# Columns that remain for modelling.
names(salida)
##  [1] "Y"                           "desv_sem"                   
##  [3] "max_mes_anterior"            "desv_trim"                  
##  [5] "dh_cant_entradas"            "pcons_tarjeta_de_credito"   
##  [7] "dh_avg_dia_retiros_d"        "dh_cant_otros_d"            
##  [9] "dmi_max_ingreso_diario"      "dh_val_otros_d"             
## [11] "dh_cant_pagos_d"             "dh_val_pagos_d"             
## [13] "pc_cuotas_pagadas"           "dh_min_dia_pagos_d"         
## [15] "pc_ingreso_por_rutina"       "cpc_saldo_sobre_ing"        
## [17] "dh_min_dia_pago_cred_d"      "cpc_saldo_tdc"              
## [19] "dh_cant_tras_d"              "cpc_avg_saldo"              
## [21] "dh_val_pago_tarj_d"          "flag_tuvo_mora60_ult3meses" 
## [23] "flag_diaPago_1quincena"      "flag_es_cluster_6"          
## [25] "flag_tuvo_mora90_ult12M"     "flax_maxDiaOtrosD_1quincena"
## [27] "flag_encontrado_cifin"

Partir datasets

# 80/20 train/test split; the fixed seed makes the partition reproducible
set.seed(1)
S <- sample.int(nrow(salida), size = round(nrow(salida) * 0.8), replace = FALSE)
train <- salida[S, ]
test <- salida[-S, ]
# Sanity check: train and test together account for every row (expect 0)
nrow(salida) - (nrow(train) + nrow(test))
## [1] 0

Scaling

Se aplicará un scaling usando punteos Z ($Z = (X_i - \bar{X}) / S$) a través de la función preProcess del paquete caret. Es importante destacar que los parámetros del train dataset (media y desviación estándar) serán usados para el scaling del test y del oot dataset.

# Standardise the predictors to z-scores to help the models learn.
# The centring/scaling parameters are estimated on the training set only.
Y <- train$Y
train$Y <- NULL
# `preProcess()` selects its transformations through `method`. The former
# `scale = TRUE, center = FALSE` arguments were absorbed by `...` and had no
# effect, so the default centring + scaling was applied anyway (the summary
# shows every predictor with mean 0). State the transformation explicitly.
preprocesado <- preProcess(train, method = c("center", "scale"))
train <- predict(preprocesado, train)
train <- cbind(Y, train)
summary(train)
##        Y             desv_sem       max_mes_anterior    desv_trim       
##  Min.   :0.0000   Min.   :-0.8059   Min.   :-0.5075   Min.   :-1.19411  
##  1st Qu.:0.0000   1st Qu.:-0.6508   1st Qu.:-0.5075   1st Qu.:-1.19411  
##  Median :1.0000   Median :-0.2743   Median :-0.3711   Median :-0.01921  
##  Mean   :0.6916   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.:1.0000   3rd Qu.: 0.3089   3rd Qu.: 0.1062   3rd Qu.: 0.66893  
##  Max.   :1.0000   Max.   :25.0106   Max.   :21.1784   Max.   : 8.93227  
##  dh_cant_entradas  pcons_tarjeta_de_credito dh_avg_dia_retiros_d
##  Min.   :-1.0603   Min.   :-0.6752          Min.   :-1.2211     
##  1st Qu.:-1.0603   1st Qu.:-0.6752          1st Qu.:-1.2211     
##  Median :-0.1007   Median :-0.6752          Median : 0.3910     
##  Mean   : 0.0000   Mean   : 0.0000          Mean   : 0.0000     
##  3rd Qu.: 0.7091   3rd Qu.: 1.4809          3rd Qu.: 0.7896     
##  Max.   : 3.9993   Max.   : 1.4809          Max.   : 3.5566     
##  dh_cant_otros_d   dmi_max_ingreso_diario dh_val_otros_d    dh_cant_pagos_d  
##  Min.   :-0.8645   Min.   :-0.5939        Min.   :-0.4234   Min.   :-0.7778  
##  1st Qu.:-0.8645   1st Qu.:-0.5939        1st Qu.:-0.4234   1st Qu.:-0.7778  
##  Median :-0.1101   Median :-0.4010        Median :-0.4165   Median :-0.2944  
##  Mean   : 0.0000   Mean   : 0.0000        Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6443   3rd Qu.: 0.1653        3rd Qu.:-0.0938   3rd Qu.: 0.5113  
##  Max.   : 5.5814   Max.   : 7.2274        Max.   : 7.7619   Max.   : 6.3123  
##  dh_val_pagos_d    pc_cuotas_pagadas dh_min_dia_pagos_d pc_ingreso_por_rutina
##  Min.   :-0.5468   Min.   :-0.6194   Min.   :-0.6552    Min.   :-0.8032      
##  1st Qu.:-0.5468   1st Qu.:-0.6194   1st Qu.:-0.6552    1st Qu.:-0.6827      
##  Median :-0.4491   Median :-0.3631   Median :-0.3244    Median :-0.3695      
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000    Mean   : 0.0000      
##  3rd Qu.: 0.1372   3rd Qu.: 0.1808   3rd Qu.: 0.1718    3rd Qu.: 0.2769      
##  Max.   : 8.6602   Max.   : 6.5910   Max.   : 4.4722    Max.   : 4.4212      
##  cpc_saldo_sobre_ing dh_min_dia_pago_cred_d cpc_saldo_tdc    
##  Min.   :-0.4104     Min.   :-0.6821        Min.   :-0.7940  
##  1st Qu.:-0.3699     1st Qu.:-0.6821        1st Qu.:-0.6288  
##  Median :-0.3519     Median :-0.5430        Median :-0.5864  
##  Mean   : 0.0000     Mean   : 0.0000        Mean   : 0.0000  
##  3rd Qu.:-0.1032     3rd Qu.: 0.2916        3rd Qu.: 0.2453  
##  Max.   :10.5052     Max.   : 3.6301        Max.   : 5.6517  
##  dh_cant_tras_d     cpc_avg_saldo     dh_val_pago_tarj_d
##  Min.   :-0.47777   Min.   :-0.5237   Min.   :-0.3933   
##  1st Qu.:-0.47777   1st Qu.:-0.4361   1st Qu.:-0.3933   
##  Median :-0.47777   Median :-0.3738   Median :-0.3933   
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   
##  3rd Qu.:-0.06338   3rd Qu.:-0.0262   3rd Qu.:-0.1123   
##  Max.   : 5.94527   Max.   : 8.8241   Max.   : 9.4425   
##  flag_tuvo_mora60_ult3meses flag_diaPago_1quincena flag_es_cluster_6
##  Min.   :-0.3841            Min.   :-1.5585        Min.   :-1.0087  
##  1st Qu.:-0.3841            1st Qu.:-1.5585        1st Qu.:-1.0087  
##  Median :-0.3841            Median : 0.6416        Median : 0.9913  
##  Mean   : 0.0000            Mean   : 0.0000        Mean   : 0.0000  
##  3rd Qu.:-0.3841            3rd Qu.: 0.6416        3rd Qu.: 0.9913  
##  Max.   : 2.6035            Max.   : 0.6416        Max.   : 0.9913  
##  flag_tuvo_mora90_ult12M flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
##  Min.   :-0.2625         Min.   :-1.1344             Min.   :-1.4521      
##  1st Qu.:-0.2625         1st Qu.:-1.1344             1st Qu.:-1.4521      
##  Median :-0.2625         Median : 0.8815             Median : 0.6886      
##  Mean   : 0.0000         Mean   : 0.0000             Mean   : 0.0000      
##  3rd Qu.:-0.2625         3rd Qu.: 0.8815             3rd Qu.: 0.6886      
##  Max.   : 3.8087         Max.   : 0.8815             Max.   : 0.6886
# Scale the test set with the parameters (mean and standard deviation)
# that were learned on the training set
Y <- test[["Y"]]
test[["Y"]] <- NULL
test <- cbind(Y, predict(preprocesado, test))
summary(test)
##        Y             desv_sem         max_mes_anterior     desv_trim       
##  Min.   :0.0000   Min.   :-0.805923   Min.   :-0.50753   Min.   :-1.19411  
##  1st Qu.:0.0000   1st Qu.:-0.642950   1st Qu.:-0.50753   1st Qu.:-1.19411  
##  Median :1.0000   Median :-0.287648   Median :-0.50753   Median :-0.04873  
##  Mean   :0.6847   Mean   :-0.003183   Mean   : 0.00545   Mean   :-0.01659  
##  3rd Qu.:1.0000   3rd Qu.: 0.285270   3rd Qu.: 0.10622   3rd Qu.: 0.66176  
##  Max.   :1.0000   Max.   :16.091238   Max.   :38.70451   Max.   : 7.25586  
##  dh_cant_entradas    pcons_tarjeta_de_credito dh_avg_dia_retiros_d
##  Min.   :-1.060281   Min.   :-0.675202        Min.   :-1.221137   
##  1st Qu.:-1.060281   1st Qu.:-0.675202        1st Qu.:-1.221137   
##  Median :-0.100676   Median :-0.675202        Median : 0.403731   
##  Mean   : 0.002092   Mean   :-0.005479        Mean   : 0.006123   
##  3rd Qu.: 0.760440   3rd Qu.: 1.480935        3rd Qu.: 0.789637   
##  Max.   : 3.756894   Max.   : 1.480935        Max.   : 4.059682   
##  dh_cant_otros_d     dmi_max_ingreso_diario dh_val_otros_d   
##  Min.   :-0.864524   Min.   :-0.593912      Min.   :-0.4234  
##  1st Qu.:-0.864524   1st Qu.:-0.593912      1st Qu.:-0.4234  
##  Median :-0.110089   Median :-0.402591      Median :-0.4194  
##  Mean   : 0.003838   Mean   : 0.008742      Mean   : 0.0152  
##  3rd Qu.: 0.644345   3rd Qu.: 0.173518      3rd Qu.:-0.1013  
##  Max.   : 5.625365   Max.   : 7.227389      Max.   : 7.8260  
##  dh_cant_pagos_d    dh_val_pagos_d     pc_cuotas_pagadas  dh_min_dia_pagos_d 
##  Min.   :-0.77780   Min.   :-0.54682   Min.   :-0.61941   Min.   :-0.655150  
##  1st Qu.:-0.77780   1st Qu.:-0.54682   1st Qu.:-0.61941   1st Qu.:-0.655150  
##  Median :-0.29438   Median :-0.43672   Median :-0.35645   Median :-0.324356  
##  Mean   : 0.02972   Mean   : 0.02651   Mean   : 0.01819   Mean   :-0.007585  
##  3rd Qu.: 0.51131   3rd Qu.: 0.14503   3rd Qu.: 0.16807   3rd Qu.: 0.171836  
##  Max.   : 7.11798   Max.   : 8.64759   Max.   : 6.59100   Max.   : 4.472163  
##  pc_ingreso_por_rutina cpc_saldo_sobre_ing dh_min_dia_pago_cred_d
##  Min.   :-0.80319      Min.   :-0.40385    Min.   :-0.68214      
##  1st Qu.:-0.68304      1st Qu.:-0.36990    1st Qu.:-0.68214      
##  Median :-0.37979      Median :-0.34969    Median :-0.54303      
##  Mean   :-0.01846      Mean   : 0.03398    Mean   :-0.01309      
##  3rd Qu.: 0.27872      3rd Qu.:-0.09272    3rd Qu.: 0.29160      
##  Max.   : 4.41872      Max.   :10.50524    Max.   : 3.63014      
##  cpc_saldo_tdc      dh_cant_tras_d     cpc_avg_saldo      dh_val_pago_tarj_d
##  Min.   :-0.77679   Min.   :-0.47777   Min.   :-0.52372   Min.   :-0.39327  
##  1st Qu.:-0.62885   1st Qu.:-0.47777   1st Qu.:-0.43609   1st Qu.:-0.39327  
##  Median :-0.56706   Median :-0.47777   Median :-0.35621   Median :-0.39327  
##  Mean   : 0.01752   Mean   :-0.01044   Mean   : 0.05172   Mean   : 0.02682  
##  3rd Qu.: 0.26016   3rd Qu.:-0.06338   3rd Qu.:-0.01338   3rd Qu.:-0.08126  
##  Max.   : 5.44829   Max.   : 5.94527   Max.   : 8.49138   Max.   : 9.44247  
##  flag_tuvo_mora60_ult3meses flag_diaPago_1quincena flag_es_cluster_6  
##  Min.   :-0.38408           Min.   :-1.55854       Min.   :-1.008661  
##  1st Qu.:-0.38408           1st Qu.:-1.55854       1st Qu.:-1.008661  
##  Median :-0.38408           Median : 0.64158       Median : 0.991344  
##  Mean   : 0.01079           Mean   :-0.03936       Mean   : 0.002148  
##  3rd Qu.:-0.38408           3rd Qu.: 0.64158       3rd Qu.: 0.991344  
##  Max.   : 2.60348           Max.   : 0.64158       Max.   : 0.991344  
##  flag_tuvo_mora90_ult12M flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
##  Min.   :-0.2625         Min.   :-1.13436            Min.   :-1.45210     
##  1st Qu.:-0.2625         1st Qu.:-1.13436            1st Qu.:-1.45210     
##  Median :-0.2625         Median : 0.88149            Median : 0.68861     
##  Mean   : 0.0003         Mean   :-0.00048            Mean   : 0.01004     
##  3rd Qu.:-0.2625         3rd Qu.: 0.88149            3rd Qu.: 0.68861     
##  Max.   : 3.8087         Max.   : 0.88149            Max.   : 0.68861

Balanceo

Se utilizará el método SMOTE para balancear las clases de la variable respuesta

# Work on a copy of the training set; the response is turned into a factor
# before it is handed to SMOTE
temp <- train
temp[["Y"]] <- as.factor(temp[["Y"]])
class(temp[["Y"]])
## [1] "factor"
# Balance the response classes with SMOTE.
# NOTE(review): no seed is set right before this call, so the result
# presumably depends on the RNG state left by the earlier set.seed(1).
S <- SMOTE(form = Y ~ ., data = temp, perc.over = 100)
table(S[["Y"]])
## 
##    0    1 
## 8904 8904
# Convert the factor response back to numeric 0/1 through its labels.
# `as.numeric(factor) - 1` works on the internal level codes and is only
# correct while the levels are exactly "0", "1" in that order.
S$Y <- as.numeric(as.character(S$Y))
table(S$Y)
## 
##    0    1 
## 8904 8904
# Balanced (SMOTE) training set, used by the models fitted on balanced data
train2 <- S

Regresión Logística

# Binomial logistic regression on the balanced training set.
# Predictors left out because of VIF > 4 or a very high p-value:
#   dh_cant_otros_d, dmi_max_ingreso_diario, dh_val_otros_d, dh_val_pagos_d,
#   dh_cant_tras_d, flag_diaPago_1quincena, flax_maxDiaOtrosD_1quincena
modelo_logistica <- glm(
  formula = Y ~
    desv_sem +
    max_mes_anterior +
    desv_trim +
    dh_cant_entradas +
    pcons_tarjeta_de_credito +
    dh_avg_dia_retiros_d +
    dh_cant_pagos_d +
    pc_cuotas_pagadas +
    dh_min_dia_pagos_d +
    pc_ingreso_por_rutina +
    cpc_saldo_sobre_ing +
    dh_min_dia_pago_cred_d +
    cpc_saldo_tdc +
    cpc_avg_saldo +
    dh_val_pago_tarj_d +
    flag_tuvo_mora60_ult3meses +
    flag_es_cluster_6 +
    flag_tuvo_mora90_ult12M +
    flag_encontrado_cifin,
  family = "binomial",
  data = train2
)

# Variance inflation factors of the fitted logistic model
car::vif(modelo_logistica)
##                   desv_sem           max_mes_anterior 
##                   2.498924                   1.438054 
##                  desv_trim           dh_cant_entradas 
##                   2.428364                   3.044172 
##   pcons_tarjeta_de_credito       dh_avg_dia_retiros_d 
##                   1.093417                   3.082506 
##            dh_cant_pagos_d          pc_cuotas_pagadas 
##                   2.620900                   1.221223 
##         dh_min_dia_pagos_d      pc_ingreso_por_rutina 
##                   2.120878                   1.195525 
##        cpc_saldo_sobre_ing     dh_min_dia_pago_cred_d 
##                   2.036106                   2.013672 
##              cpc_saldo_tdc              cpc_avg_saldo 
##                   1.507262                   2.061050 
##         dh_val_pago_tarj_d flag_tuvo_mora60_ult3meses 
##                   1.401443                   1.691738 
##          flag_es_cluster_6    flag_tuvo_mora90_ult12M 
##                   1.202198                   1.802167 
##      flag_encontrado_cifin 
##                   1.551095
# Coefficient table with p-values
print(summary(modelo_logistica))
## 
## Call:
## glm(formula = Y ~ desv_sem + max_mes_anterior + desv_trim + dh_cant_entradas + 
##     pcons_tarjeta_de_credito + dh_avg_dia_retiros_d + dh_cant_pagos_d + 
##     pc_cuotas_pagadas + dh_min_dia_pagos_d + pc_ingreso_por_rutina + 
##     cpc_saldo_sobre_ing + dh_min_dia_pago_cred_d + cpc_saldo_tdc + 
##     cpc_avg_saldo + dh_val_pago_tarj_d + flag_tuvo_mora60_ult3meses + 
##     flag_es_cluster_6 + flag_tuvo_mora90_ult12M + flag_encontrado_cifin, 
##     family = "binomial", data = train2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0183  -1.1163   0.2137   1.0492   3.5088  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 0.054021   0.016054   3.365 0.000765 ***
## desv_sem                   -0.416694   0.032800 -12.704  < 2e-16 ***
## max_mes_anterior           -0.198661   0.026386  -7.529 5.11e-14 ***
## desv_trim                  -0.054705   0.026350  -2.076 0.037887 *  
## dh_cant_entradas            0.337139   0.028536  11.815  < 2e-16 ***
## pcons_tarjeta_de_credito   -0.108810   0.016486  -6.600 4.11e-11 ***
## dh_avg_dia_retiros_d       -0.101548   0.028240  -3.596 0.000323 ***
## dh_cant_pagos_d            -0.068394   0.025683  -2.663 0.007745 ** 
## pc_cuotas_pagadas           0.016334   0.017699   0.923 0.356092    
## dh_min_dia_pagos_d         -0.013568   0.023460  -0.578 0.563019    
## pc_ingreso_por_rutina       0.007065   0.017877   0.395 0.692695    
## cpc_saldo_sobre_ing        -0.024267   0.023083  -1.051 0.293124    
## dh_min_dia_pago_cred_d      0.075297   0.023111   3.258 0.001122 ** 
## cpc_saldo_tdc              -0.177958   0.019884  -8.950  < 2e-16 ***
## cpc_avg_saldo               0.032348   0.023794   1.360 0.173984    
## dh_val_pago_tarj_d          0.069909   0.019983   3.498 0.000468 ***
## flag_tuvo_mora60_ult3meses -0.169527   0.020893  -8.114 4.90e-16 ***
## flag_es_cluster_6           0.084413   0.017430   4.843 1.28e-06 ***
## flag_tuvo_mora90_ult12M    -0.138459   0.021116  -6.557 5.49e-11 ***
## flag_encontrado_cifin      -0.317650   0.020251 -15.685  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24687  on 17807  degrees of freedom
## Residual deviance: 22554  on 17788  degrees of freedom
## AIC: 22594
## 
## Number of Fisher Scoring iterations: 4
# Evaluate the classification performance of a fitted model at one cutoff.
#
# Args:
#   modelo:  fitted model for which
#            `predict(modelo, dataset, type = "response")` returns one
#            probability per row (e.g. a binomial `glm`).
#   dataset: data frame with the predictors and the observed 0/1 response
#            in column `Y`.
#   corte:   probability cutoff; rows with probability >= corte are labelled 1.
#
# Returns:
#   One-row data frame with columns corte, acc, auc, gini, TNR and TPR
#   (metrics rounded to 2 decimals; gini = 2 * auc - 1).
rendimiento <- function(modelo, dataset, corte)
{
  # Score the model and data that were passed in. The previous version
  # ignored both arguments and always used the globals `modelo_logistica`
  # and `test`.
  R <- dataset$Y
  PP <- as.numeric(predict(modelo, dataset, type = "response"))
  P <- ifelse(PP >= corte, 1, 0)

  # Fix both sets of levels so the confusion matrix is always 2 x 2, even
  # when every row receives the same label (diag() of a 2 x 1 table does not
  # give the number of correct predictions).
  tabla <- table(factor(R, levels = c(0, 1)), factor(P, levels = c(0, 1)))
  acc <- round(sum(diag(tabla)) / sum(tabla), 2)

  # NOTE(review): the ROC curve is built from the thresholded labels P, not
  # from the probabilities PP, so this "auc" varies with `corte`. Kept as is
  # because the cutoff search relies on it; confirm whether a
  # threshold-free AUC (roc(R, PP)) is what should be reported.
  # Levels and direction are stated explicitly instead of auto-detected.
  auc <- round(as.numeric(roc(R, P, levels = c(0, 1), direction = "<")$auc), 2)
  gini <- (2 * auc) - 1

  # Rows of `tabla` are the observed classes, columns the predicted ones.
  TNR <- round(as.numeric(tabla[1, 1]) / sum(tabla[1, ]), 2)
  TPR <- round(as.numeric(tabla[2, 2]) / sum(tabla[2, ]), 2)

  data.frame(corte, acc, auc, gini, TNR, TPR)
}

# Performance of the logistic model on the test set at the 0.5 cutoff
rendimiento(modelo = modelo_logistica, dataset = test, corte = 0.5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.5 0.65 0.63 0.26 0.58 0.68
# Sweep the cutoff from 0.20 to 0.80 in steps of 0.01 and print the metrics
# obtained at each value
for (i in seq(from = 0.2, to = 0.8, by = 0.01)) {
  print(rendimiento(modelo = modelo_logistica, dataset = test, corte = i))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini TNR  TPR
## 1   0.2 0.7 0.54 0.08 0.1 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.21 0.7 0.54 0.08 0.11 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.22 0.7 0.54 0.08 0.11 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.23 0.7 0.54 0.08 0.12 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.24 0.7 0.55  0.1 0.13 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.25 0.7 0.55  0.1 0.13 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.26 0.7 0.55  0.1 0.14 0.95
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.27 0.7 0.55  0.1 0.15 0.95
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.28 0.7 0.56 0.12 0.17 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.29 0.7 0.56 0.12 0.18 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini TNR  TPR
## 1   0.3 0.7 0.57 0.14 0.2 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.31 0.7 0.57 0.14 0.21 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.32 0.7 0.58 0.16 0.23 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.33 0.71 0.58 0.16 0.24 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.34 0.71 0.59 0.18 0.27 0.91
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR TPR
## 1  0.35 0.71 0.6  0.2 0.29 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini TNR TPR
## 1  0.36 0.71 0.6  0.2 0.3 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.37 0.71 0.6  0.2 0.32 0.88
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.38 0.7 0.61 0.22 0.34 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.39 0.7 0.61 0.22 0.35 0.86
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1   0.4 0.7 0.61 0.22 0.36 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.41 0.7 0.61 0.22 0.38 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.42 0.69 0.61 0.22 0.4 0.82
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.43 0.68 0.61 0.22 0.42 0.81
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.44 0.68 0.62 0.24 0.45 0.79
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.45 0.68 0.62 0.24 0.47 0.77
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.46 0.67 0.63 0.26 0.5 0.75
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.47 0.67 0.62 0.24 0.51 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.48 0.66 0.62 0.24 0.54 0.71
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.49 0.65 0.62 0.24 0.55 0.69
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.5 0.65 0.63 0.26 0.58 0.68
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.51 0.64 0.63 0.26 0.6 0.66
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.52 0.63 0.63 0.26 0.62 0.63
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.53 0.62 0.63 0.26 0.65 0.6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.54 0.61 0.63 0.26 0.68 0.58
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.55 0.6 0.62 0.24 0.69 0.56
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.56 0.59 0.62 0.24 0.71 0.53
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.57 0.57 0.61 0.22 0.73 0.5
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.58 0.56 0.61 0.22 0.76 0.47
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.59 0.54 0.61 0.22 0.77 0.44
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1   0.6 0.53 0.6  0.2 0.79 0.41
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.61 0.52 0.59 0.18 0.8 0.38
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.62 0.5 0.59 0.18 0.83 0.36
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.63 0.49 0.58 0.16 0.84 0.33
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.64 0.48 0.58 0.16 0.86 0.31
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.65 0.47 0.58 0.16 0.87 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.66 0.46 0.58 0.16 0.89 0.26
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.67 0.44 0.57 0.14 0.9 0.23
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.68 0.43 0.56 0.12 0.92 0.21
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.69 0.42 0.56 0.12 0.93 0.18
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1   0.7 0.4 0.55  0.1 0.94 0.16
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.71 0.39 0.54 0.08 0.95 0.13
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.72 0.38 0.53 0.06 0.96 0.11
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.73 0.36 0.53 0.06 0.97 0.08
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.74 0.35 0.52 0.04 0.97 0.07
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.75 0.35 0.52 0.04 0.98 0.06
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.76 0.33 0.51 0.02 0.99 0.03
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.77 0.33 0.51 0.02 0.99 0.03
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.78 0.32 0.5    0 0.99 0.02
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.79 0.32 0.5    0 0.99 0.01
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1   0.8 0.32 0.5    0 0.99 0.01
# Final result for the logistic regression at the chosen cutoff (0.5),
# labelled with the model name in column "Modelo"
resultado1 <- data.frame(Modelo = "RegLog", rendimiento(modelo_logistica, test, 0.5))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(resultado1)
##   Modelo corte  acc  auc gini  TNR  TPR
## 1 RegLog   0.5 0.65 0.63 0.26 0.58 0.68

Random Forest

Se entrenó un random forest tanto con datos sin balancear como con datos balanceados a través del método SMOTE (paquete DMwR); finalmente, el 2° random forest presentó el mejor AUC con un punto de corte de 0.5.

# Usar el FOR loop para buscar los mejores parámetros

# Evaluate the classification performance of a fitted classifier at one
# cutoff.
#
# Args:
#   modelo:  fitted model for which `predict(modelo, dataset, type = "prob")`
#            returns a matrix; its second column is used as the probability
#            of the positive class (e.g. a `randomForest` classifier).
#   dataset: data frame with the predictors and the observed 0/1 response
#            in column `Y`.
#   corte:   probability cutoff; rows with probability >= corte are labelled 1.
#
# Returns:
#   One-row data frame with columns corte, acc, auc, gini, TNR and TPR
#   (metrics rounded to 2 decimals; gini = 2 * auc - 1).
rendimiento <- function(modelo, dataset, corte)
{
  # Take the observed response from `dataset`. The previous version read it
  # from the global `test`, whatever data set was being scored.
  R <- dataset$Y
  PP <- as.numeric(predict(modelo, dataset, type = "prob")[, 2])
  P <- ifelse(PP >= corte, 1, 0)

  # Fix both sets of levels so the confusion matrix is always 2 x 2, even
  # when every row receives the same label (diag() of a 2 x 1 table does not
  # give the number of correct predictions).
  tabla <- table(factor(R, levels = c(0, 1)), factor(P, levels = c(0, 1)))
  acc <- round(sum(diag(tabla)) / sum(tabla), 2)

  # NOTE(review): the ROC curve is built from the thresholded labels P, not
  # from the probabilities PP, so this "auc" varies with `corte`. Kept as is
  # because the cutoff search relies on it; confirm whether a
  # threshold-free AUC (roc(R, PP)) is what should be reported.
  # Levels and direction are stated explicitly instead of auto-detected.
  auc <- round(as.numeric(roc(R, P, levels = c(0, 1), direction = "<")$auc), 2)
  gini <- (2 * auc) - 1

  # Rows of `tabla` are the observed classes, columns the predicted ones.
  TNR <- round(as.numeric(tabla[1, 1]) / sum(tabla[1, ]), 2)
  TPR <- round(as.numeric(tabla[2, 2]) / sum(tabla[2, ]), 2)

  data.frame(corte, acc, auc, gini, TNR, TPR)
}

# Random forest trained on the unbalanced training set.
# The single-pass loop is scaffolding for trying several parameter values.
for (parametro in seq_len(1)) {
  # Fit
  set.seed(1)
  RF1 <- randomForest(
    as.factor(Y) ~ .,
    data = train,
    ntree = 250,
    mtry = 7,
    classwt = c(1.5, 1)
  )

  # Report test-set performance at the 0.5 cutoff
  print(cbind(parametro, rendimiento(RF1, test, 0.5)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   parametro corte  acc  auc gini  TNR  TPR
## 1         1   0.5 0.75 0.67 0.34 0.44 0.89
# Random forest trained on the balanced (SMOTE) training set.
# The single-pass loop is scaffolding for trying several parameter values.
for (parametro in seq_len(1)) {
  # Fit
  set.seed(1)
  RF2 <- randomForest(
    as.factor(Y) ~ .,
    data = train2,
    ntree = 100,
    mtry = 7,
    replace = TRUE
  )

  # Report test-set performance at the 0.5 cutoff
  print(cbind(parametro, rendimiento(RF2, test, 0.5)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   parametro corte  acc  auc gini TNR  TPR
## 1         1   0.5 0.71 0.71 0.42 0.7 0.71
# Sweep the cutoff from 0.20 to 0.80 in steps of 0.01 and print the RF2
# metrics obtained at each value
for (i in seq(from = 0.2, to = 0.8, by = 0.01)) {
  print(rendimiento(modelo = RF2, dataset = test, corte = i))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1   0.2 0.74 0.6  0.2 0.23 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.21 0.74 0.61 0.22 0.26 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.22 0.74 0.61 0.22 0.26 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.23 0.74 0.62 0.24 0.28 0.95
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.24 0.74 0.62 0.24 0.3 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.25 0.74 0.62 0.24 0.3 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.26 0.74 0.63 0.26 0.32 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.27 0.75 0.64 0.28 0.34 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.28 0.75 0.64 0.28 0.36 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.29 0.75 0.65  0.3 0.4 0.91
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1   0.3 0.75 0.66 0.32 0.41 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.31 0.75 0.66 0.32 0.41 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.32 0.74 0.66 0.32 0.42 0.89
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.33 0.75 0.66 0.32 0.44 0.89
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.34 0.75 0.67 0.34 0.46 0.88
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.35 0.74 0.67 0.34 0.47 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.36 0.74 0.68 0.36 0.49 0.86
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.37 0.74 0.68 0.36 0.5 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.38 0.74 0.68 0.36 0.52 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.39 0.74 0.69 0.38 0.54 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.4 0.74 0.69 0.38 0.56 0.82
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.41 0.73 0.69 0.38 0.59 0.8
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.42 0.73 0.69 0.38 0.6 0.79
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.43 0.73 0.7  0.4 0.61 0.78
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.44 0.73 0.7  0.4 0.61 0.78
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.45 0.72 0.7  0.4 0.63 0.77
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.46 0.72 0.7  0.4 0.64 0.76
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.47 0.72 0.7  0.4 0.67 0.74
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.48 0.71 0.7  0.4 0.68 0.72
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.49 0.71 0.7  0.4 0.68 0.72
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1   0.5 0.71 0.71 0.42 0.7 0.71
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc auc gini  TNR TPR
## 1  0.51 0.7 0.7  0.4 0.71 0.7
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc auc gini  TNR  TPR
## 1  0.52 0.7 0.7  0.4 0.72 0.69
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.53 0.69 0.7  0.4 0.73 0.67
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.54 0.68 0.7  0.4 0.74 0.65
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.55 0.68 0.7  0.4 0.76 0.64
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1  0.56 0.67 0.7  0.4 0.77 0.62
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR TPR
## 1  0.57 0.66 0.7  0.4 0.79 0.6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.58 0.65 0.69 0.38 0.8 0.59
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.59 0.65 0.69 0.38 0.82 0.57
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.6 0.64 0.69 0.38 0.82 0.55
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.61 0.63 0.68 0.36 0.83 0.53
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.62 0.63 0.68 0.36 0.83 0.53
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.63 0.62 0.68 0.36 0.85 0.52
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.64 0.61 0.68 0.36 0.86 0.5
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.65 0.61 0.68 0.36 0.87 0.48
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.66 0.6 0.67 0.34 0.88 0.47
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.67 0.59 0.67 0.34 0.88 0.45
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.68 0.58 0.66 0.32 0.89 0.44
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.69 0.57 0.66 0.32 0.89 0.42
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR TPR
## 1   0.7 0.56 0.65  0.3 0.9 0.4
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.71 0.55 0.64 0.28 0.9 0.38
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.72 0.54 0.64 0.28 0.91 0.37
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.73 0.53 0.63 0.26 0.91 0.35
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.74 0.52 0.62 0.24 0.92 0.33
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.75 0.51 0.62 0.24 0.92 0.32
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR TPR
## 1  0.76 0.5 0.62 0.24 0.93 0.3
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.77 0.49 0.61 0.22 0.94 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.78 0.49 0.61 0.22 0.94 0.28
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.79 0.48 0.61 0.22 0.95 0.27
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc auc gini  TNR  TPR
## 1   0.8 0.47 0.6  0.2 0.95 0.25
# Finally: score the random forest RF2 on the test set at the 0.5 cutoff and
# keep the one-row metrics table, prefixed with the model label
resultado2 <- as.data.frame(cbind("RandomForest", rendimiento(RF2, test, 0.5)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Give the label column (first column) its final name
names(resultado2)[1] <- "Modelo"
print(resultado2)
##         Modelo corte  acc  auc gini TNR  TPR
## 1 RandomForest   0.5 0.71 0.71 0.42 0.7 0.71
# In addition, the predicted probabilities of the second class column are
# distributed as follows:
hist(as.numeric(predict(RF2, test, type = "prob")[,2]))

AdaBoost

# Prepare everything for the model
usar <- train2
usar$Y <- as.factor(usar$Y) # the target is converted to a factor before fitting
set.seed(1) # fixes the RNG state used by the fits that follow

# Train the model with all the variables and the default settings.
# NOTE(review): the "Importance" annotations on the terms below do not match
# the values printed by modeloAda$importance after this fit (e.g. desv_sem
# 33.7 here vs 29.3 printed), so they appear to come from an earlier run;
# confirm before relying on them.
modeloAda <- boosting(
  Y ~ 
    desv_sem   # Importance: 33.7176260590549
  +   max_mes_anterior   # Importance: 18.8506694661822
  +   dh_cant_tras_d   # Importance: 6.66562441816714
  +   dh_min_dia_pagos_d   # Importance: 5.74571007099648
  +   flag_tuvo_mora60_ult3meses   # Importance: 5.27516596907519
  +   pc_cuotas_pagadas   # Importance: 4.94772841710666
  +   dh_cant_entradas   # Importance: 3.6451690230774
  +   flag_encontrado_cifin   # Importance: 3.51978408061676
  +   pc_ingreso_por_rutina   # Importance: 3.20238547917842
  +   desv_trim   # Importance: 3.06059257514681
  +   cpc_saldo_tdc   # Importance: 2.89533364487582
  +   dh_cant_otros_d   # Importance: 2.49716825059181
  +   dh_val_pago_tarj_d   # Importance: 1.03252217294607
  +   dh_cant_pagos_d   # Importance: 0.899084100587163
  +   cpc_avg_saldo   # Importance: 0.860227126123119
  +   dh_min_dia_pago_cred_d   # Importance: 0.790252390234587
  +   pcons_tarjeta_de_credito   # Importance: 0.623763602744435
  +   dh_val_otros_d   # Importance: 0.525760920959076
  +   dmi_max_ingreso_diario   # Importance: 0.316415754836402
  +   cpc_saldo_sobre_ing   # Importance: 0.313236673318644
  +   dh_val_pagos_d   # Importance: 0.266451837486692
  +   dh_avg_dia_retiros_d   # Importance: 0.168203660652142
  +   flag_diaPago_1quincena   # Importance: 0.104916028226669
  +   flag_es_cluster_6   # Importance: 0.0762082778154691
  +   flag_tuvo_mora90_ult12M   # Importance: 0
  +   flax_maxDiaOtrosD_1quincena   # Importance: 0
  , data = usar
)


# Print the importance of each variable in the fitted model
modeloAda$importance # variable importance
##               cpc_avg_saldo         cpc_saldo_sobre_ing 
##                  0.62945995                  0.62137479 
##               cpc_saldo_tdc                    desv_sem 
##                  2.02408780                 29.34513646 
##                   desv_trim        dh_avg_dia_retiros_d 
##                  5.06654437                  0.38408579 
##            dh_cant_entradas             dh_cant_otros_d 
##                  3.96578792                  2.53680494 
##             dh_cant_pagos_d              dh_cant_tras_d 
##                  1.01793538                 11.50905232 
##      dh_min_dia_pago_cred_d          dh_min_dia_pagos_d 
##                  1.07949399                  5.87323983 
##              dh_val_otros_d          dh_val_pago_tarj_d 
##                  0.48679549                  1.63684079 
##              dh_val_pagos_d      dmi_max_ingreso_diario 
##                  0.45164458                  0.56812405 
##      flag_diaPago_1quincena       flag_encontrado_cifin 
##                  0.04640731                  2.98015468 
##           flag_es_cluster_6  flag_tuvo_mora60_ult3meses 
##                  0.23381769                  3.30989601 
##     flag_tuvo_mora90_ult12M flax_maxDiaOtrosD_1quincena 
##                  0.23937819                  0.00000000 
##            max_mes_anterior           pc_cuotas_pagadas 
##                 18.36357557                  4.13535714 
##       pc_ingreso_por_rutina    pcons_tarjeta_de_credito 
##                  2.78858873                  0.70641620
# Evaluate model performance
#
# Scores `dataset` with a model whose predict() result carries a `$prob`
# matrix (as the boosting model fitted above does) and summarises the
# classification quality at a given probability cutoff.
#
# Args:
#   modelo:  fitted model; predict(modelo, dataset)$prob[, 2] is used as the
#            score of class 1.
#   dataset: data frame to score; must contain the observed target `Y` coded
#            as 0/1 (numeric or factor).
#   corte:   cutoff; observations with score >= corte are predicted as 1.
#
# Returns:
#   A one-row data frame with the columns corte, acc, auc, gini, TNR and TPR
#   (all metrics rounded to 2 decimals).
rendimiento <- function(modelo, dataset, corte) {
  # The observed classes come from the dataset being scored. The previous
  # version always read the global `test`, ignoring the `dataset` argument.
  stopifnot(!is.null(dataset$Y))
  real <- as.numeric(as.character(dataset$Y))

  prob <- as.numeric(predict(modelo, dataset)$prob[, 2])
  pred <- ifelse(prob >= corte, 1, 0)

  # Share of correct predictions. Comparing the vectors directly stays correct
  # when every observation is predicted as the same class, where the diagonal
  # of a 2 x 1 confusion table would not.
  acc <- round(mean(pred == real, na.rm = TRUE), 2)

  # AUC is computed on the probabilities, not on the thresholded 0/1
  # predictions, so it describes the ranking quality of the model and no longer
  # changes with `corte`. Levels and direction are fixed explicitly: 0 is the
  # control class and higher scores indicate class 1.
  curva <- roc(real, prob, levels = c(0, 1), direction = "<", quiet = TRUE)
  auc <- round(as.numeric(curva$auc), 2)
  gini <- (2 * auc) - 1

  # True negative rate and true positive rate at the chosen cutoff
  TNR <- round(sum(real == 0 & pred == 0, na.rm = TRUE) /
                 sum(real == 0, na.rm = TRUE), 2)
  TPR <- round(sum(real == 1 & pred == 1, na.rm = TRUE) /
                 sum(real == 1, na.rm = TRUE), 2)

  data.frame(corte = corte, acc = acc, auc = auc, gini = gini,
             TNR = TNR, TPR = TPR)
}


# Model omitting some variables: only the 13 terms with the highest annotated
# importance are kept; the remaining terms are commented out below.
# This fit replaces the modeloAda object trained with all the variables, and it
# is not re-seeded (the RNG state continues from the earlier set.seed(1)).
modeloAda <- boosting(
  Y ~ 
    desv_sem   # Importance: 33.7176260590549
  +   max_mes_anterior   # Importance: 18.8506694661822
  +   dh_cant_tras_d   # Importance: 6.66562441816714
  +   dh_min_dia_pagos_d   # Importance: 5.74571007099648
  +   flag_tuvo_mora60_ult3meses   # Importance: 5.27516596907519
  +   pc_cuotas_pagadas   # Importance: 4.94772841710666
  +   dh_cant_entradas   # Importance: 3.6451690230774
  +   flag_encontrado_cifin   # Importance: 3.51978408061676
  +   pc_ingreso_por_rutina   # Importance: 3.20238547917842
  +   desv_trim   # Importance: 3.06059257514681
  +   cpc_saldo_tdc   # Importance: 2.89533364487582
  +   dh_cant_otros_d   # Importance: 2.49716825059181
  +   dh_val_pago_tarj_d   # Importance: 1.03252217294607
  #+   dh_cant_pagos_d   # Importance: 0.899084100587163
  #+   cpc_avg_saldo   # Importance: 0.860227126123119
  #+   dh_min_dia_pago_cred_d   # Importance: 0.790252390234587
  #+   pcons_tarjeta_de_credito   # Importance: 0.623763602744435
  #+   dh_val_otros_d   # Importance: 0.525760920959076
  #+   dmi_max_ingreso_diario   # Importance: 0.316415754836402
  #+   cpc_saldo_sobre_ing   # Importance: 0.313236673318644
  #+   dh_val_pagos_d   # Importance: 0.266451837486692
  #+   dh_avg_dia_retiros_d   # Importance: 0.168203660652142
  #+   flag_diaPago_1quincena   # Importance: 0.104916028226669
  #+   flag_es_cluster_6   # Importance: 0.0762082778154691
  #+   flag_tuvo_mora90_ult12M   # Importance: 0
  #+   flax_maxDiaOtrosD_1quincena   # Importance: 0
  , data = usar
  # presumably the number of boosting iterations (see adabag::boosting) — confirm
  , mfinal = 10
  # presumably draws a weighted bootstrap sample on each iteration — confirm
  , boos = TRUE
)

# Performance of the reduced model on the test set at the 0.5 cutoff
rendimiento(modeloAda, test, 0.5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.5 0.66 0.64 0.28 0.59 0.69
# Look for the best cutoff: print the metrics of the AdaBoost model on the
# test set for every threshold from 0.20 to 0.80 in steps of 0.01
for (i in seq(from = 0.2, to = 0.8, by = 0.01)) {
  print(rendimiento(modelo = modeloAda, dataset = test, corte = i))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.2 0.71 0.58 0.16 0.22 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.21 0.71 0.58 0.16 0.25 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.22 0.71 0.58 0.16 0.25 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.23 0.7 0.59 0.18 0.26 0.91
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR TPR
## 1  0.24 0.7 0.59 0.18 0.27 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini TNR  TPR
## 1  0.25 0.7 0.59 0.18 0.3 0.89
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc auc gini  TNR  TPR
## 1  0.26 0.7 0.6  0.2 0.31 0.88
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc auc gini  TNR  TPR
## 1  0.27 0.7 0.6  0.2 0.33 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc auc gini  TNR  TPR
## 1  0.28 0.7 0.6  0.2 0.34 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc auc gini  TNR  TPR
## 1  0.29 0.7 0.6  0.2 0.35 0.86
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1   0.3 0.7 0.61 0.22 0.36 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.31 0.7 0.61 0.22 0.37 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.32 0.7 0.61 0.22 0.38 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.33 0.69 0.61 0.22 0.39 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.34 0.69 0.62 0.24 0.41 0.83
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.35 0.69 0.62 0.24 0.41 0.82
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.36 0.69 0.62 0.24 0.43 0.81
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.37 0.69 0.62 0.24 0.45 0.8
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.38 0.69 0.62 0.24 0.45 0.8
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.39 0.69 0.63 0.26 0.46 0.79
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.4 0.68 0.63 0.26 0.48 0.77
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.41 0.67 0.63 0.26 0.5 0.76
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.42 0.68 0.63 0.26 0.51 0.75
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.43 0.68 0.63 0.26 0.52 0.75
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.44 0.67 0.63 0.26 0.53 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.45 0.67 0.63 0.26 0.54 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.46 0.67 0.63 0.26 0.54 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.47 0.67 0.64 0.28 0.56 0.72
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.48 0.66 0.64 0.28 0.56 0.71
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.49 0.66 0.64 0.28 0.57 0.7
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.5 0.66 0.64 0.28 0.59 0.69
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.51 0.65 0.64 0.28 0.6 0.68
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.52 0.65 0.64 0.28 0.6 0.67
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.53 0.65 0.64 0.28 0.61 0.67
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.54 0.64 0.64 0.28 0.62 0.66
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.55 0.64 0.64 0.28 0.63 0.64
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.56 0.63 0.64 0.28 0.64 0.63
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.57 0.63 0.64 0.28 0.66 0.61
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.58 0.62 0.64 0.28 0.68 0.6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.59 0.62 0.64 0.28 0.69 0.59
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1   0.6 0.62 0.64 0.28 0.69 0.59
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.61 0.61 0.63 0.26 0.7 0.57
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1  0.62 0.61 0.63 0.26 0.7 0.56
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.63 0.6 0.64 0.28 0.72 0.55
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.64 0.59 0.63 0.26 0.74 0.52
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.65 0.59 0.63 0.26 0.74 0.52
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.66 0.58 0.63 0.26 0.75 0.51
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.67 0.57 0.62 0.24 0.75 0.49
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.68 0.56 0.62 0.24 0.77 0.47
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.69 0.56 0.62 0.24 0.79 0.45
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1   0.7 0.55 0.62 0.24 0.8 0.44
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.71 0.55 0.62 0.24 0.81 0.42
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.72 0.54 0.62 0.24 0.81 0.42
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR TPR
## 1  0.73 0.54 0.61 0.22 0.82 0.4
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte acc  auc gini  TNR  TPR
## 1  0.74 0.5 0.59 0.18 0.86 0.33
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.75 0.49 0.59 0.18 0.86 0.32
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.76 0.49 0.59 0.18 0.86 0.32
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.77 0.48 0.59 0.18 0.87 0.31
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.78 0.48 0.58 0.16 0.87 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini  TNR  TPR
## 1  0.79 0.47 0.58 0.16 0.87 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##   corte  acc  auc gini TNR  TPR
## 1   0.8 0.46 0.58 0.16 0.9 0.26
# Final AdaBoost row for the model comparison, evaluated at the 0.56 cutoff.
# NOTE(review): 0.56 is presumably chosen because it is the point of the sweep
# above where TNR (0.64) and TPR (0.63) are closest to each other — confirm.
resultado3 <- cbind("AdaBoost", rendimiento(modeloAda, test, 0.56))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Give the label column (first column) its final name
names(resultado3)[1] <- "Modelo"
resultado3
##     Modelo corte  acc  auc gini  TNR  TPR
## 1 AdaBoost  0.56 0.63 0.64 0.28 0.64 0.63

Torneo de modelos

El modelo ganador es un Random Forest usando la data balanceada, ntree = 100 y mtry = 7. Este modelo nos da un AUC de 0.71 y un GINI de 0.42. Dado que el AUC es >= 0.70 el modelo se considera aceptable y viable para su utilización, pero se recomienda buscar un AUC >= 0.80 a través de:

# Finally: stack the one-row summaries of the three models into a single
# comparison table (it is printed because the result is not assigned)

do.call(rbind, list(resultado1, resultado2, resultado3))
##         Modelo corte  acc  auc gini  TNR  TPR
## 1       RegLog  0.50 0.65 0.63 0.26 0.58 0.68
## 2 RandomForest  0.50 0.71 0.71 0.42 0.70 0.71
## 3     AdaBoost  0.56 0.63 0.64 0.28 0.64 0.63

OOT dataset y creación de base_prueba evaluado

# Load the out-of-time (OOT) dataset. This overwrites the `original` and `data`
# objects that previously held the training set.
original <- read.csv("Base_prueba.csv", header = TRUE)
dim(original) # 1,000 rows and 123 columns loaded
## [1] 1000  123
data <- original # the feature engineering will be done on this dataset

# Omit outliers
#
# Each entry is the upper cap of one column: rows whose value is above the cap
# (or missing) are dropped. pc_transaccional used to be filtered twice
# (<= 136903394 and later <= 92269310); only the stricter cap is kept because
# it alone determines which rows survive.
limites_superiores <- c(
  pc_transaccional       = 92269310.0,
  dmi_max_egreso_diario  = 85362328,
  dmi_max_ingreso_diario = 104909227.3,
  dh_val_otros_d         = 9998335.80,
  dmi_ingreso_total_mes  = 205163705.0,
  dh_val_pagos_d         = 50357360.00,
  pc_gasto_familiar      = 26582180.19,
  pc_cuotas_pagadas      = 8156850.0,
  dmi_egreso_total_mes   = 209829245.0,
  dh_val_salidas         = 217011100,
  cpc_sum_saldo          = 220905770.00,
  cpc_saldo_sobre_ing    = 20.307294513,
  cpc_saldo_tdc          = 43186883.96,
  dh_cant_tras_d         = 31,
  cpc_avg_saldo          = 106884393.50,
  dh_val_pago_tarj_d     = 10560315.0
)

# Fail loudly if a capped column is absent instead of silently dropping every
# row (comparing a missing column yields an empty logical vector)
stopifnot(all(names(limites_superiores) %in% names(data)))

A <- nrow(data) # rows before filtering
for (variable in names(limites_superiores)) {
  # which() discards NA comparisons, which matches what subset() did
  dentro_limite <- which(data[[variable]] <= limites_superiores[[variable]])
  data <- data[dentro_limite, , drop = FALSE]
}
B <- nrow(data) # rows after filtering

paste(A - B, "registros omitidos")
## [1] "98 registros omitidos"
# Exploration and transformation of variables
# For each column: show its name, inspect its distribution (raw and, where
# plotted, after sqrt) and, when the variable is kept, store it in a
# stand-alone vector either raw or sqrt-transformed.

names(data)[1] # max_trim: maximum days past due in the previous quarter
## [1] "max_trim"
hist(data$max_trim)

hist(sqrt(data$max_trim)) # variable normalized with sqrt

max_trim <- sqrt(data$max_trim)

names(data)[2] # max_sem: maximum days past due in the previous semester
## [1] "max_sem"
hist(data$max_sem)

hist(sqrt(data$max_sem))

max_sem <- data$max_sem # kept on the original scale

names(data)[3] # desv_sem: standard deviation of the maximum arrears in the previous semester
## [1] "desv_sem"
hist(data$desv_sem)

hist(sqrt(data$desv_sem))

desv_sem <- data$desv_sem # kept on the original scale

names(data)[4] # prom_bim: average of the maximum arrears in the previous two-month period
## [1] "prom_bim"
hist(data$prom_bim)

hist(sqrt(data$prom_bim))

prom_bim <- sqrt(data$prom_bim)

names(data)[5] # max_mes_anterior: maximum days past due in the previous month
## [1] "max_mes_anterior"
hist(data$max_mes_anterior)

hist(sqrt(data$max_mes_anterior))

max_mes_anterior <- data$max_mes_anterior # kept on the original scale

names(data)[6] # prom_mes_anterior: average days past due in the previous month
## [1] "prom_mes_anterior"
hist(data$prom_mes_anterior)

hist(sqrt(data$prom_mes_anterior))

prom_mes_anterior <- sqrt(data$prom_mes_anterior)

names(data)[7] # prom_sem: average of the maximum arrears in the previous semester
## [1] "prom_sem"
hist(data$prom_sem)

hist(sqrt(data$prom_sem))

prom_sem <- data$prom_sem # kept on the original scale

names(data)[8] # max_bim: maximum days past due in the previous two-month period
## [1] "max_bim"
hist(data$max_bim)

hist(sqrt(data$max_bim))

max_bim <- data$max_bim # kept on the original scale

names(data)[9] # mejor_gestion: best collection action performed
## [1] "mejor_gestion"
# Only tabulated; no derived variable is created for it here
table(data$mejor_gestion)
## 
##   0  15 
## 901   1
names(data)[10] # prom_trim: average of the maximum arrears in the previous quarter
## [1] "prom_trim"
hist(data$prom_trim)

hist(sqrt(data$prom_trim))

prom_trim <- sqrt(data$prom_trim)

names(data)[11] # pc_cant_moras_30_ult_12_meses: number of 30-day arrears in the last 12 months <= 5 or empty.
## [1] "pc_cant_moras_30_ult_12_meses"
summary(data$pc_cant_moras_30_ult_12_meses)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.8758  1.0000 12.0000
table(data$pc_cant_moras_30_ult_12_meses)
## 
##   0   1   2   3   4   6   7   8   9  10  11  12 
## 639 124  24  31  32  18  12   3  12   5   1   1
pc_cant_moras_30_ult_12_meses <- data$pc_cant_moras_30_ult_12_meses # kept as is

names(data)[12] # desv_trim: standard deviation of the maximum arrears in the previous quarter
## [1] "desv_trim"
hist(data$desv_trim)

hist(sqrt(data$desv_trim))

desv_trim <- sqrt(data$desv_trim)

names(data)[13] # nro_gestiones: number of collection actions performed
## [1] "nro_gestiones"
# Only tabulated; no derived variable is created for it here
table(data$nro_gestiones)
## 
##   0   2 
## 901   1
names(data)[14] # desv_bim: standard deviation of the maximum days past due in the previous two-month period
## [1] "desv_bim"
hist(data$desv_bim)

hist(sqrt(data$desv_bim))

desv_bim <- sqrt(data$desv_bim)

names(data)[15] # pc_cant_moras_30_ult_3_meses: number of 30-day arrears in the last 3 months = 0 or empty.
## [1] "pc_cant_moras_30_ult_3_meses"
# Only summarised and plotted; no derived variable is created for it here
summary(data$pc_cant_moras_30_ult_3_meses)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3869  0.0000  3.0000
hist(data$pc_cant_moras_30_ult_3_meses)

names(data)[16] # dh_cant_entradas: number of incoming-money transactions in the previous month
## [1] "dh_cant_entradas"
summary(data$dh_cant_entradas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    6.00   11.59   15.00  129.00
hist(data$dh_cant_entradas)

hist(sqrt(data$dh_cant_entradas))

dh_cant_entradas <- sqrt(data$dh_cant_entradas)

names(data)[17] # pc_tiem_1er_prod_abierto_total: total time with an open product
## [1] "pc_tiem_1er_prod_abierto_total"
# Only tabulated and plotted; no derived variable is created for it here
table(data$pc_tiem_1er_prod_abierto_total)
## 
##   0  34  63  65  66 122 
## 897   1   1   1   1   1
hist(data$pc_tiem_1er_prod_abierto_total)

names(data)[18] # pc_cant_moras_60_ult_12_meses: number of 60-day arrears in the last 12 months <= 1 or empty.
## [1] "pc_cant_moras_60_ult_12_meses"
table(data$pc_cant_moras_60_ult_12_meses)
## 
##   0   1   2   3   4   5   6   7   8   9  12 
## 778  27  35  23   5   5   6  11   6   5   1
hist(data$pc_cant_moras_60_ult_12_meses) # it will be turned into a binary variable

# Flag = 1 when there was at least one 60-day arrear in the last 12 months
x <- ifelse(data$pc_cant_moras_60_ult_12_meses >= 1, 1, 0)
table(x)
## x
##   0   1 
## 778 124
flag_mora60_ult12meses <- x

names(data)[19] # gestiones_eficaces: number of effective collection actions
## [1] "gestiones_eficaces"
# Only tabulated; no derived variable is created for it here
table(data$gestiones_eficaces)
## 
##   0   1 
## 901   1
names(data)[20] # pc_transaccional: customer income according to the transactional estimator
## [1] "pc_transaccional"
summary(data$pc_transaccional)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  3520425  8811787 15070019 20470286 87552251
boxplot(data$pc_transaccional) # outliers on the upper side of the variable

# Percentiles from 0% to 100% in steps of 1%
quantile(data$pc_transaccional, seq(0, 1, by = 0.01))
##       0%       1%       2%       3%       4%       5%       6%       7% 
##        0        0        0        0        0        0        0        0 
##       8%       9%      10%      11%      12%      13%      14%      15% 
##        0        0        0        0        0    12000   125570   434500 
##      16%      17%      18%      19%      20%      21%      22%      23% 
##   931700  1232876  1583097  2327651  2390242  2598273  2905406  3127205 
##      24%      25%      26%      27%      28%      29%      30%      31% 
##  3327102  3520425  3654249  3880363  3900000  4154062  4341266  4785855 
##      32%      33%      34%      35%      36%      37%      38%      39% 
##  4785855  5056821  5270000  5446336  5684445  5822250  6001779  6187601 
##      40%      41%      42%      43%      44%      45%      46%      47% 
##  6392479  6392479  6487250  6531800  6707020  6757348  6964516  7264240 
##      48%      49%      50%      51%      52%      53%      54%      55% 
##  7654414  8280285  8811787  9280070  9411475  9826828 10069905 10765247 
##      56%      57%      58%      59%      60%      61%      62%      63% 
## 11115000 11212284 11863116 12310765 12733000 12796336 13160097 13700000 
##      64%      65%      66%      67%      68%      69%      70%      71% 
## 14952138 15247194 15386678 15532112 16000000 16566361 16929221 18004000 
##      72%      73%      74%      75%      76%      77%      78%      79% 
## 18543000 18662584 20000000 20470286 20470286 21185026 21500000 22029301 
##      80%      81%      82%      83%      84%      85%      86%      87% 
## 24256619 24256619 24613534 25494191 27920966 28156019 29246260 30636482 
##      88%      89%      90%      91%      92%      93%      94%      95% 
## 33093800 37612700 37940660 42301382 42852690 45546580 54933792 55731375 
##      96%      97%      98%      99%     100% 
## 59325287 64094730 78140440 78196633 87552251
# Histogram of pc_transaccional restricted to values <= 92269310, the same cap
# already applied in the outlier step (the maximum in the summary above is
# 87552251, so no value is excluded here)
x <- subset(data$pc_transaccional, data$pc_transaccional <= 92269310.0)
hist(x)

pc_transaccional <- data$pc_transaccional # kept on the original scale

names(data)[21] # dh_max_dia_entradas: last day on which an incoming-money transaction was received
## [1] "dh_max_dia_entradas"
table(data$dh_max_dia_entradas)
## 
##   0   1   2   3   4   5   6   7   9  10  11  12  13  14  15  16  18  20  21  22 
## 291   2   3   1   2   1   2   1   1   1   1   1   4   6   2   1   5   2   2   5 
##  23  24  25  26  27  28  29  30  31 
##  11   8  19   8  23  78  24 145 252
hist(data$dh_max_dia_entradas) # it will be turned into a binary variable

# Flag = 1 when the value is <= 15 (first half of the month); note that the
# value 0 also falls in this group
x <- ifelse(data$dh_max_dia_entradas <= 15, 1, 0)
table(x)
## x
##   0   1 
## 583 319
flag_ultima_entrada_1quincena <- x

names(data)[22] # pc_cupo_entidad: credit limit of the credit cards held at the bank
## [1] "pc_cupo_entidad"
# Only explored; no derived variable is created for it here
summary(data$pc_cupo_entidad)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    21508        0 17000000
hist(data$pc_cupo_entidad)

table(ifelse(data$pc_cupo_entidad >= 1, 1, 0))
## 
##   0   1 
## 899   3
names(data)[23] # pc_cuotas_como_ppal: installments paid as principal
## [1] "pc_cuotas_como_ppal"
# Only explored; no derived variable is created for it here
summary(data$pc_cuotas_como_ppal)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   18017       0 6103800
hist(data$pc_cuotas_como_ppal)

names(data)[24] # dh_val_entradas: total value of the income received in the previous month
## [1] "dh_val_entradas"
summary(data$dh_val_entradas)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   5685000  17657538  25060713 181987436
hist(data$dh_val_entradas)

hist(sqrt(data$dh_val_entradas))

dh_val_entradas <- sqrt(data$dh_val_entradas)

names(data)[25] # pc_cant_moras_90_ult_12_meses: number of arrears of 90 days or more in the last 12 months = 0 or empty.
## [1] "pc_cant_moras_90_ult_12_meses"
# Only explored; no derived variable is created for it here
table(data$pc_cant_moras_90_ult_12_meses)
## 
##   0   1   2   3   4   5   6   7  12 
## 814  27  22   7   4  13   3  11   1
hist(data$pc_cant_moras_90_ult_12_meses)

hist(sqrt(data$pc_cant_moras_90_ult_12_meses))

names(data)[26] # dh_max_dia_salidas
## [1] "dh_max_dia_salidas"
# I prefer to keep the same variable but for the current month

names(data)[27] # pc_cant_moras_60_ult_3_meses: number of 60-day arrears in the last 3 months = 0 or empty.
## [1] "pc_cant_moras_60_ult_3_meses"
table(data$pc_cant_moras_60_ult_3_meses)
## 
##   0   1   2   3 
## 800  33  24  45
table(ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0))
## 
##   0   1 
## 800 102
# Flag = 1 when there was at least one 60-day arrear in the last 3 months
flag_tuvo_mora60_ult3meses <- ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0)

names(data)[28] # pc_cuota_tarjeta_de_credito: credit card installment reported by CIFIN
## [1] "pc_cuota_tarjeta_de_credito"
# Only explored; no derived variable is created for it here
summary(data$pc_cuota_tarjeta_de_credito)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    9882       0 4726800
hist(data$pc_cuota_tarjeta_de_credito)

table(ifelse(data$pc_cuota_tarjeta_de_credito >= 1, 1, 0))
## 
##   0   1 
## 897   5
names(data)[29] # cp_inicial_menos_saldo: valor inicial menos el saldo en el mes anterior por producto
## [1] "cp_inicial_menos_saldo"
summary(data$cp_inicial_menos_saldo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[30] # pc_peor_estado_act_cta_aho: Peor estado cuenta ahorro
## [1] "pc_peor_estado_act_cta_aho"
summary(data$pc_peor_estado_act_cta_aho)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0133  0.0000  3.0000
hist(data$pc_peor_estado_act_cta_aho)

# Column 31 - dia_pago: day of the month on which the obligation is paid.
colnames(data)[31]
## [1] "dia_pago"
table(data[["dia_pago"]])
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  38  12 198  11  13   8  12   9  14  13  19   9  15  11 246  17 103  26   9  10 
##  21  22  23  24  25  26  27  28  29  30  31 
##   5  10  10  14   8  14   9   4  17  17   1
# 0/1 indicator: 1 when the payment day is the 15th or earlier. The scratch
# name `x` is kept because table() prints it as the header.
flag_diaPago_1quincena <- x <- as.numeric(data[["dia_pago"]] <= 15)
table(x)
## x
##   0   1 
## 274 628

# Column 32 - cp_cuotas_falta: number of remaining installments.
# Every row is 0 in the table below.
colnames(data)[32]
## [1] "cp_cuotas_falta"
table(data[["cp_cuotas_falta"]])
## 
##   0 
## 902
# Column 33 - pcons_tarjeta_de_credito: whether the product is a credit card.
colnames(data)[33]
## [1] "pcons_tarjeta_de_credito"
table(data[["pcons_tarjeta_de_credito"]])
## 
##   0   1 
## 607 295
# Already 0/1, so it is copied unchanged.
pcons_tarjeta_de_credito <- data[["pcons_tarjeta_de_credito"]]

# Column 34 - pc_cifin: customer income according to CIFIN.
colnames(data)[34]
## [1] "pc_cifin"
summary(data[["pc_cifin"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   21787       0 7505630
# hist() call left as written: the default plot title is built from the
# argument expression.
hist(data$pc_cifin)

# Column 35 - gestiones_prod: number of productive management actions.
colnames(data)[35]
## [1] "gestiones_prod"
table(data[["gestiones_prod"]])
## 
##   0   1 
## 901   1
# Column 36 - pcons_vehiculos_sufi: whether the product is a Sufi vehicle
# product.
colnames(data)[36]
## [1] "pcons_vehiculos_sufi"
table(data[["pcons_vehiculos_sufi"]])
## 
##   0   1 
## 836  66
# Column 37 - cluster_recod: cluster segment.
colnames(data)[37]
## [1] "cluster_recod"
table(data[["cluster_recod"]])
## 
##   1   6   7  13  16  19 
##   8 458 246  82  70  38
# 0/1 indicator for membership in cluster 6, the most frequent value above.
# The scratch name `x` is kept because table() prints it as the header.
flag_es_cluster_6 <- x <- as.numeric(data[["cluster_recod"]] == 6)
table(x)
## x
##   0   1 
## 444 458

# Column 38 - dh_avg_dia_retiros_d: average day of the month on which
# withdrawals are made.
colnames(data)[38]
## [1] "dh_avg_dia_retiros_d"
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$dh_avg_dia_retiros_d)

# Copied unchanged into a standalone vector.
dh_avg_dia_retiros_d <- data[["dh_avg_dia_retiros_d"]]

# Column 39 - dmi_max_egreso_diario: maximum single-day outflow in the
# previous month.
colnames(data)[39]
## [1] "dmi_max_egreso_diario"
summary(data[["dmi_max_egreso_diario"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0  2509288  5687549  8127384 82421516
hist(data$dmi_max_egreso_diario)

# Percentiles in 1% steps.
quantile(data[["dmi_max_egreso_diario"]], probs = seq(0, 1, by = 0.01))
##          0%          1%          2%          3%          4%          5% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         18%         19%         20%         21%         22%         23% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         24%         25%         26%         27%         28%         29% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         30%         31%         32%         33%         34%         35% 
##        0.00        0.00        0.00        0.00        0.00    11399.35 
##         36%         37%         38%         39%         40%         41% 
##    57635.56    76406.00   282165.00   546344.40   848246.40  1120309.30 
##         42%         43%         44%         45%         46%         47% 
##  1278535.00  1506000.00  1586002.00  1800000.00  1808016.60  2005671.00 
##         48%         49%         50%         51%         52%         53% 
##  2071476.36  2412097.00  2509287.50  2788249.00  2999088.00  3012780.00 
##         54%         55%         56%         57%         58%         59% 
##  3329386.00  3514000.00  3614412.00  3790122.86  3917983.00  3993058.06 
##         60%         61%         62%         63%         64%         65% 
##  4314267.60  4329499.00  4760574.00  5179535.80  5434157.40  5609550.35 
##         66%         67%         68%         69%         70%         71% 
##  6002005.00  6191910.48  6386272.00  6571371.65  6966758.20  7028000.00 
##         72%         73%         74%         75%         76%         77% 
##  7321076.92  7619654.00  7970963.00  8127383.75  8774232.00  8875431.00 
##         78%         79%         80%         81%         82%         83% 
##  9546222.00  9937420.51 10054365.60 10794511.00 11621264.60 11949578.85 
##         84%         85%         86%         87%         88%         89% 
## 12373146.00 13004433.00 13213167.00 13285191.00 14333466.00 15038382.25 
##         90%         91%         92%         93%         94%         95% 
## 15383235.00 15568797.00 16518175.44 18115766.45 20069227.10 20599199.10 
##         96%         97%         98%         99%        100% 
## 22645335.40 24225138.55 31233119.30 35903859.00 82421516.00
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 82421516.00, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["dmi_max_egreso_diario"]][
  which(data[["dmi_max_egreso_diario"]] <= 85362328)
]
hist(x)

# The untransformed column is kept as a standalone vector.
dmi_max_egreso_diario <- data[["dmi_max_egreso_diario"]]

# Column 40 - cpc_max_proc_deuda: maximum debt percentage in the previous
# month.
colnames(data)[40]
## [1] "cpc_max_proc_deuda"
summary(data[["cpc_max_proc_deuda"]])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -28.182000   0.000000   0.003515   0.158809   0.847877   1.272912
# Column 41 - dh_cant_otros_d: number of outgoing transactions classified as
# "other".
colnames(data)[41]
## [1] "dh_cant_otros_d"
summary(data[["dh_cant_otros_d"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   3.204   5.000  28.000
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$dh_cant_otros_d)

hist(sqrt(data$dh_cant_otros_d))

# Keep the square-root-transformed counts as a standalone vector.
dh_cant_otros_d <- sqrt(data[["dh_cant_otros_d"]])

# Column 42 - pc_cont_30_lt_12m_tot_sf: count of 30-day delinquencies in the
# last 12 months, financial sector. Every row is 0 in the output below.
colnames(data)[42]
## [1] "pc_cont_30_lt_12m_tot_sf"
summary(data[["pc_cont_30_lt_12m_tot_sf"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$pc_cont_30_lt_12m_tot_sf)

table(data[["pc_cont_30_lt_12m_tot_sf"]])
## 
##   0 
## 902
# Column 43 - pc_cant_mora90_ult_12m_total.
colnames(data)[43]
## [1] "pc_cant_mora90_ult_12m_total"
table(data[["pc_cant_mora90_ult_12m_total"]])
## 
##   0   1   2   3   4   5   6   7  12 
## 856  16   2   1   4   8   3  11   1
# Collapse the count into a 0/1 indicator (1 = at least one occurrence).
table(as.numeric(data[["pc_cant_mora90_ult_12m_total"]] >= 1))
## 
##   0   1 
## 856  46
flag_tuvo_mora90_ult12M <- x <- as.numeric(
  data[["pc_cant_mora90_ult_12m_total"]] >= 1
)

# Column 44 - dmi_max_ingreso_diario: maximum single-day inflow in the
# previous month.
colnames(data)[44]
## [1] "dmi_max_ingreso_diario"
summary(data[["dmi_max_ingreso_diario"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0  2534003  6832447 10000000 99410001
hist(data$dmi_max_ingreso_diario)

# Percentiles in 1% steps.
quantile(data[["dmi_max_ingreso_diario"]], probs = seq(0, 1, by = 0.01))
##          0%          1%          2%          3%          4%          5% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         18%         19%         20%         21%         22%         23% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         24%         25%         26%         27%         28%         29% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         30%         31%         32%         33%         34%         35% 
##        0.00        0.00        0.00        0.00        0.00        2.35 
##         36%         37%         38%         39%         40%         41% 
##      309.00      615.78    50000.00   354796.22   835472.40  1000000.00 
##         42%         43%         44%         45%         46%         47% 
##  1200000.00  1485150.36  1648761.12  1944331.70  2045478.26  2123515.51 
##         48%         49%         50%         51%         52%         53% 
##  2337440.00  2402294.67  2534003.00  2984507.57  3000006.56  3215770.00 
##         54%         55%         56%         57%         58%         59% 
##  3460925.00  3622502.05  3843849.28  3900211.00  4207500.00  4407615.09 
##         60%         61%         62%         63%         64%         65% 
##  4651730.20  5067101.17  5500000.00  5728921.55  6019212.60  6209045.40 
##         66%         67%         68%         69%         70%         71% 
##  6399660.00  6875000.00  7000000.00  7200000.00  7348904.00  8000000.00 
##         72%         73%         74%         75%         76%         77% 
##  8378862.00  9052866.00  9986436.52 10000000.00 10000000.00 10000000.00 
##         78%         79%         80%         81%         82%         83% 
## 10257019.00 10638016.52 11216000.00 11974437.86 13164000.00 13445811.23 
##         84%         85%         86%         87%         88%         89% 
## 13700001.00 14357715.70 14950272.84 15830394.00 17779183.08 18490095.00 
##         90%         91%         92%         93%         94%         95% 
## 19175443.00 19495802.00 20801484.44 23791026.00 25000000.00 26902125.00 
##         96%         97%         98%         99%        100% 
## 33287967.36 35678300.00 39273879.36 41724001.00 99410001.00
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 99410001.00, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["dmi_max_ingreso_diario"]][
  which(data[["dmi_max_ingreso_diario"]] <= 104909227.3)
]
hist(x)

# The untransformed column is kept as a standalone vector.
dmi_max_ingreso_diario <- data[["dmi_max_ingreso_diario"]]

# Column 45 - dh_val_otros_d: total value of outgoing transactions
# classified as "other" in a month.
colnames(data)[45]
## [1] "dh_val_otros_d"
summary(data[["dh_val_otros_d"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0   12586  802098  697294 9777278
# hist() call left as written: the default plot title is built from the
# argument expression.
hist(data$dh_val_otros_d)

# Percentiles in 1% steps.
quantile(data[["dh_val_otros_d"]], probs = seq(0, 1, by = 0.01))
##         0%         1%         2%         3%         4%         5%         6% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##         7%         8%         9%        10%        11%        12%        13% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        14%        15%        16%        17%        18%        19%        20% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        21%        22%        23%        24%        25%        26%        27% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        28%        29%        30%        31%        32%        33%        34% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        35%        36%        37%        38%        39%        40%        41% 
##       0.00       0.00       0.00       0.00       0.00       0.00       0.00 
##        42%        43%        44%        45%        46%        47%        48% 
##       0.00       0.00       0.00       0.00       0.00    1610.00    2756.40 
##        49%        50%        51%        52%        53%        54%        55% 
##   10100.00   12586.00   13271.00   19601.04   20200.00   28692.00   38891.20 
##        56%        57%        58%        59%        60%        61%        62% 
##   49211.00   56686.15   60420.00   68691.68   76824.00  119253.19  133905.00 
##        63%        64%        65%        66%        67%        68%        69% 
##  174475.03  201200.00  233389.00  271140.00  309311.93  332894.00  349316.00 
##        70%        71%        72%        73%        74%        75%        76% 
##  407576.50  452845.97  483246.00  513799.14  596183.08  697293.75  713901.68 
##        77%        78%        79%        80%        81%        82%        83% 
##  827700.00  928996.18 1033514.61 1102974.80 1127520.00 1271960.24 1384444.00 
##        84%        85%        86%        87%        88%        89%        90% 
## 1648990.52 1716832.70 1872117.00 2066960.98 2388746.72 2564671.00 2652146.00 
##        91%        92%        93%        94%        95%        96%        97% 
## 2956094.31 3320259.56 3663815.21 4115774.00 5073191.50 5825378.92 6362258.37 
##        98%        99%       100% 
## 7211642.08 7645053.36 9777278.00
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 9777278.00, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["dh_val_otros_d"]][
  which(data[["dh_val_otros_d"]] <= 9998335.80)
]
hist(x)

# The untransformed column is kept as a standalone vector.
dh_val_otros_d <- data[["dh_val_otros_d"]]

# Column 46 - pc_ingreso_final: customer's final income.
colnames(data)[46]
## [1] "pc_ingreso_final"
summary(data[["pc_ingreso_final"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  3315000  8362472 14053377 18708862 87552251
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$pc_ingreso_final)

pc_ingreso_final <- data[["pc_ingreso_final"]]

# Column 47 - dh_cant_pagos_d: number of outgoing payments in the previous
# month.
colnames(data)[47]
## [1] "dh_cant_pagos_d"
summary(data[["dh_cant_pagos_d"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   3.500   5.346   8.000  35.000
hist(data$dh_cant_pagos_d)

dh_cant_pagos_d <- data[["dh_cant_pagos_d"]]

# Column 48 - dmi_ingreso_total_mes: total inflow of the previous month.
# NOTE(review): this summary matches the one printed earlier for
# dh_val_entradas value for value - possibly a duplicated column; confirm.
colnames(data)[48]
## [1] "dmi_ingreso_total_mes"
summary(data[["dmi_ingreso_total_mes"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   5685000  17657538  25060713 181987436
hist(data$dmi_ingreso_total_mes)

# Percentiles in 1% steps.
quantile(data[["dmi_ingreso_total_mes"]], probs = seq(0, 1, by = 0.01))
##           0%           1%           2%           3%           4%           5% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##           6%           7%           8%           9%          10%          11% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          12%          13%          14%          15%          16%          17% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          18%          19%          20%          21%          22%          23% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          24%          25%          26%          27%          28%          29% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          30%          31%          32%          33%          34%          35% 
##         0.00         0.00         0.00         0.00         0.00         3.70 
##          36%          37%          38%          39%          40%          41% 
##       317.28      1592.74     53032.00    520933.81   1001004.40   1724907.50 
##          42%          43%          44%          45%          46%          47% 
##   2240001.00   2520715.00   3185149.60   3571411.35   4411157.02   4817541.00 
##          48%          49%          50%          51%          52%          53% 
##   5160000.00   5461628.03   5685000.00   6531800.00   7248359.04   7793376.50 
##          54%          55%          56%          57%          58%          59% 
##   8270000.00   8530406.00   9188379.16   9658108.00  10416391.60  11189637.61 
##          60%          61%          62%          63%          64%          65% 
##  11948978.60  12418588.01  13000000.00  13673026.68  14513000.00  15067301.05 
##          66%          67%          68%          69%          70%          71% 
##  16128330.00  16748600.00  18163873.72  18547426.92  20132597.40  21500000.00 
##          72%          73%          74%          75%          76%          77% 
##  22000000.00  22528017.00  23377940.20  25060713.00  27384078.44  28400354.79 
##          78%          79%          80%          81%          82%          83% 
##  30557407.96  31241943.45  31844647.40  31944284.00  34174945.00  36977650.33 
##          84%          85%          86%          87%          88%          89% 
##  39152130.00  43148753.00  46185407.00  48841121.03  52888577.60  53816089.00 
##          90%          91%          92%          93%          94%          95% 
##  56323174.00  60959669.06  66736925.00  67675419.00  68100000.00  71880165.00 
##          96%          97%          98%          99%         100% 
##  73308273.00  79961357.24  88862497.06 109768839.74 181987436.00
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 181987436.00, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["dmi_ingreso_total_mes"]][
  which(data[["dmi_ingreso_total_mes"]] <= 205163705.0)
]
hist(x)

# The untransformed column is kept as a standalone vector.
dmi_ingreso_total_mes <- data[["dmi_ingreso_total_mes"]]

# Column 49 - dh_val_pagos_d: total value of outgoing transactions
# classified as payments/other.
colnames(data)[49]
## [1] "dh_val_pagos_d"
summary(data[["dh_val_pagos_d"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0   537640  3639828  4661394 46719192
# hist() call left as written: the default plot title is built from the
# argument expression.
hist(data$dh_val_pagos_d)

# Percentiles in 1% steps.
quantile(data[["dh_val_pagos_d"]], probs = seq(0, 1, by = 0.01))
##          0%          1%          2%          3%          4%          5% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         18%         19%         20%         21%         22%         23% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         24%         25%         26%         27%         28%         29% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         30%         31%         32%         33%         34%         35% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         36%         37%         38%         39%         40%         41% 
##        0.00        0.00       75.34    42343.00    56773.00    69144.00 
##         42%         43%         44%         45%         46%         47% 
##    91390.00   151658.00   159690.00   165118.55   178014.00   179314.00 
##         48%         49%         50%         51%         52%         53% 
##   306936.36   318125.77   537640.00   678424.39   806356.20  1059441.20 
##         54%         55%         56%         57%         58%         59% 
##  1212291.94  1330827.25  1560711.00  1662900.00  1823001.70  1999493.24 
##         60%         61%         62%         63%         64%         65% 
##  2085516.80  2220129.00  2315083.28  2490672.54  2778419.00  3040516.20 
##         66%         67%         68%         69%         70%         71% 
##  3277671.00  3344574.00  3409709.20  3536084.43  3600516.00  3766572.79 
##         72%         73%         74%         75%         76%         77% 
##  3900425.24  4283458.20  4490363.42  4661393.75  4973548.56  5176261.00 
##         78%         79%         80%         81%         82%         83% 
##  5271188.86  5399363.49  5707806.00  6166031.00  6853466.00  7095437.00 
##         84%         85%         86%         87%         88%         89% 
##  7265797.40  7453799.00  7733374.12  8210036.00  9130377.00  9750499.50 
##         90%         91%         92%         93%         94%         95% 
## 10373706.00 10980804.97 12745205.00 13360645.10 13522747.00 14262631.00 
##         96%         97%         98%         99%        100% 
## 15040059.56 21305241.00 27212693.58 36549357.01 46719192.00
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 46719192.00, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["dh_val_pagos_d"]][
  which(data[["dh_val_pagos_d"]] <= 50357360.00)
]
hist(x)

# The untransformed column is kept as a standalone vector.
dh_val_pagos_d <- data[["dh_val_pagos_d"]]

# Column 50 - pc_gasto_familiar: customer's household expense amount.
colnames(data)[50]
## [1] "pc_gasto_familiar"
summary(data[["pc_gasto_familiar"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0   998131  2284566  3596369  4568750 21888063
# Percentiles in 1% steps.
quantile(data[["pc_gasto_familiar"]], probs = seq(0, 1, by = 0.01))
##          0%          1%          2%          3%          4%          5% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00    63033.82   350000.00   491062.78   595099.36 
##         18%         19%         20%         21%         22%         23% 
##   665194.70   765000.00   829934.46   844698.40   864358.37   905320.38 
##         24%         25%         26%         27%         28%         29% 
##   943500.00   998130.70  1053864.93  1086803.67  1107646.36  1170000.00 
##         30%         31%         32%         33%         34%         35% 
##  1235790.67  1307972.22  1401226.17  1435756.50  1449533.38  1564995.31 
##         36%         37%         38%         39%         40%         41% 
##  1584240.00  1629589.42  1630082.22  1637151.00  1654248.75  1680000.00 
##         42%         43%         44%         45%         46%         47% 
##  1710290.10  1732919.27  1773123.19  1800533.81  1852381.20  2042659.37 
##         48%         49%         50%         51%         52%         53% 
##  2118915.12  2232519.95  2284565.98  2314858.89  2397248.78  2446335.13 
##         54%         55%         56%         57%         58%         59% 
##  2510217.45  2530766.61  2568167.58  2705762.50  2708752.54  2759275.31 
##         60%         61%         62%         63%         64%         65% 
##  2778750.00  2853154.49  2990963.55  3240028.70  3262926.88  3304375.00 
##         66%         67%         68%         69%         70%         71% 
##  3400000.00  3557347.50  3738034.50  3850755.50  3981578.82  4232305.25 
##         72%         73%         74%         75%         76%         77% 
##  4258569.80  4349935.73  4473926.14  4568750.00  4635750.00  4665646.00 
##         78%         79%         80%         81%         82%         83% 
##  5079723.23  5154531.56  5214205.69  5296256.50  5296256.50  5660150.00 
##         84%         85%         86%         87%         88%         89% 
##  5976224.40  6430000.00  7311417.07  7509873.84  8004993.21  8217756.65 
##         90%         91%         92%         93%         94%         95% 
##  9000354.71  9106196.62  9403175.00 10619436.29 12488403.37 13932843.75 
##         96%         97%         98%         99%        100% 
## 14228463.97 15245233.01 19531282.01 19549158.17 21888062.75
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 21888062.75, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["pc_gasto_familiar"]][
  which(data[["pc_gasto_familiar"]] <= 26582180.19)
]
hist(x)

# The untransformed column is kept as a standalone vector.
pc_gasto_familiar <- data[["pc_gasto_familiar"]]

# Column 51 - pc_cuotas_pagadas: amount of installments paid by the customer.
colnames(data)[51]
## [1] "pc_cuotas_pagadas"
summary(data[["pc_cuotas_pagadas"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0  343148  913284 1373781 7184545
# hist() call left as written: the default plot title is built from the
# argument expression.
hist(data$pc_cuotas_pagadas)

# Percentiles in 1% steps.
quantile(data[["pc_cuotas_pagadas"]], probs = seq(0, 1, by = 0.01))
##        0%        1%        2%        3%        4%        5%        6%        7% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##        8%        9%       10%       11%       12%       13%       14%       15% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##       16%       17%       18%       19%       20%       21%       22%       23% 
##       0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0 
##       24%       25%       26%       27%       28%       29%       30%       31% 
##       0.0       0.0       0.0       0.0     130.0   62150.0   87000.0  116000.0 
##       32%       33%       34%       35%       36%       37%       38%       39% 
##  143028.0  145000.0  179265.8  213500.0  220000.0  242425.2  258000.0  258000.0 
##       40%       41%       42%       43%       44%       45%       46%       47% 
##  258000.0  258000.0  258000.0  275855.6  280000.0  286810.0  294600.0  300000.0 
##       48%       49%       50%       51%       52%       53%       54%       55% 
##  332317.2  333200.0  343148.2  364200.7  384200.0  384698.3  400000.0  429782.6 
##       56%       57%       58%       59%       60%       61%       62%       63% 
##  473000.0  490000.0  504164.1  516000.0  543636.1  543636.1  646224.7  648815.1 
##       64%       65%       66%       67%       68%       69%       70%       71% 
##  676982.9  806622.1  847182.0  847182.0  855700.0  860000.0  874639.5  928418.7 
##       72%       73%       74%       75%       76%       77%       78%       79% 
## 1065817.2 1120496.9 1354695.3 1373781.0 1484196.6 1503863.1 1730412.1 1737940.1 
##       80%       81%       82%       83%       84%       85%       86%       87% 
## 1769838.0 1808000.0 1840035.3 1855943.0 1885749.1 1891250.5 2117686.3 2132206.4 
##       88%       89%       90%       91%       92%       93%       94%       95% 
## 2283470.0 2451854.4 2492565.3 2493312.4 2865653.8 2983499.5 3017035.4 3234839.9 
##       96%       97%       98%       99%      100% 
## 3303345.8 4616921.3 5167146.6 7155587.2 7184544.5
# Scratch vector for plotting: values at or below the cutoff (NA dropped).
# NOTE(review): the 100% quantile printed above is 7184544.5, which is
# below this cutoff, so nothing is removed here - confirm the threshold.
x <- data[["pc_cuotas_pagadas"]][
  which(data[["pc_cuotas_pagadas"]] <= 8156850.0)
]
hist(x)

# The untransformed column is kept as a standalone vector.
pc_cuotas_pagadas <- data[["pc_cuotas_pagadas"]]

# Column 52 - cpc_avg_proc_deuda: average debt percentage of a customer in
# the previous month (percentage = ratio of balance to initial value).
colnames(data)[52]
## [1] "cpc_avg_proc_deuda"
summary(data[["cpc_avg_proc_deuda"]])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -28.182000   0.000000   0.003075   0.136219   0.799321   1.224909
# hist() call left as written: the default plot title is built from the
# argument expression.
hist(data$cpc_avg_proc_deuda)

# Percentiles in 1% steps.
quantile(data[["cpc_avg_proc_deuda"]], probs = seq(0, 1, by = 0.01))
##            0%            1%            2%            3%            4% 
## -28.182000000  -0.021213000  -0.003272055   0.000000000   0.000000000 
##            5%            6%            7%            8%            9% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           10%           11%           12%           13%           14% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           15%           16%           17%           18%           19% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           20%           21%           22%           23%           24% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           25%           26%           27%           28%           29% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           30%           31%           32%           33%           34% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           35%           36%           37%           38%           39% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           40%           41%           42%           43%           44% 
##   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000 
##           45%           46%           47%           48%           49% 
##   0.000000000   0.002474419   0.002631783   0.002631783   0.002631783 
##           50%           51%           52%           53%           54% 
##   0.003075038   0.018633346   0.059156000   0.096896600   0.115248053 
##           55%           56%           57%           58%           59% 
##   0.168734000   0.215616247   0.246319000   0.277754880   0.312850843 
##           60%           61%           62%           63%           64% 
##   0.362930164   0.402114125   0.427537092   0.442012067   0.478433000 
##           65%           66%           67%           68%           69% 
##   0.545081738   0.582370425   0.594232417   0.640208326   0.684193917 
##           70%           71%           72%           73%           74% 
##   0.700694736   0.746537701   0.757732590   0.776391909   0.784705667 
##           75%           76%           77%           78%           79% 
##   0.799320623   0.824395833   0.830392909   0.834281000   0.842780631 
##           80%           81%           82%           83%           84% 
##   0.853675575   0.858839117   0.868951947   0.895844211   0.929755381 
##           85%           86%           87%           88%           89% 
##   0.962535958   0.966394034   0.973559686   0.980652725   0.990590504 
##           90%           91%           92%           93%           94% 
##   1.017928827   1.045238187   1.072557625   1.085973066   1.098219571 
##           95%           96%           97%           98%           99% 
##   1.099494833   1.111387925   1.119784114   1.149961810   1.193426598 
##          100% 
##   1.224909400
# Column 53 - cpc_sum_proc_deuda: sum of the debt percentages of a customer
# in the previous month (percentage = ratio of balance to initial value).
# NOTE(review): this summary matches the one printed for cpc_avg_proc_deuda
# value for value - possibly a duplicated column; confirm.
colnames(data)[53]
## [1] "cpc_sum_proc_deuda"
summary(data[["cpc_sum_proc_deuda"]])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -28.182000   0.000000   0.003075   0.136219   0.799321   1.224909
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$cpc_sum_proc_deuda)

# Column 54 - dc_porc_prod_sin_mora: percentage of products without
# delinquency across the whole system.
colnames(data)[54]
## [1] "dc_porc_prod_sin_mora"
summary(data[["dc_porc_prod_sin_mora"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.005543 0.000000 1.000000
hist(data$dc_porc_prod_sin_mora)

# Percentiles in 1% steps.
quantile(data[["dc_porc_prod_sin_mora"]], probs = seq(0, 1, by = 0.01))
##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  96%  97%  98%  99% 100% 
##    0    0    0    0    1
# Column 55 - pc_ingreso_rutina_con_techo: routine income after applying the
# per-segment caps.
colnames(data)[55]
## [1] "pc_ingreso_rutina_con_techo"
summary(data[["pc_ingreso_rutina_con_techo"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  3550276  8811787 15065481 20470286 87552251
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$pc_ingreso_rutina_con_techo)

# Copied unchanged into a standalone vector.
pc_ingreso_rutina_con_techo <- data[["pc_ingreso_rutina_con_techo"]]

# Column 56 - pc_saldo_prom3_tdc_entidad: average credit card balance over
# the last 3 months at the bank.
colnames(data)[56]
## [1] "pc_saldo_prom3_tdc_entidad"
summary(data[["pc_saldo_prom3_tdc_entidad"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    5652       0 3070000
hist(data$pc_saldo_prom3_tdc_entidad)

# Column 57 - dh_cant_salidas: number of outgoing money transactions in a
# month.
colnames(data)[57]
## [1] "dh_cant_salidas"
summary(data[["dh_cant_salidas"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   19.00   28.66   49.00  184.00
hist(data$dh_cant_salidas)

# Copied unchanged into a standalone vector.
dh_cant_salidas <- data[["dh_cant_salidas"]]

# Column 58 - dh_min_dia_pagos_d: first day on which some credit was made in
# the previous month.
colnames(data)[58]
## [1] "dh_min_dia_pagos_d"
summary(data[["dh_min_dia_pagos_d"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   3.171   3.000  31.000
# hist() calls are left as written: the default plot title is built from
# the argument expression.
hist(data$dh_min_dia_pagos_d)

# Copied unchanged into a standalone vector.
dh_min_dia_pagos_d <- data[["dh_min_dia_pagos_d"]]

# Column 59 - pc_ingreso_por_rutina: routine income.
colnames(data)[59]
## [1] "pc_ingreso_por_rutina"
summary(data[["pc_ingreso_por_rutina"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0  3550276  8811787 15083531 20470286 87552251
hist(data$pc_ingreso_por_rutina)

# Copied unchanged into a standalone vector.
pc_ingreso_por_rutina <- data[["pc_ingreso_por_rutina"]]

# Column 60 - dh_min_dia_pago_tarj_d: first day on which the credit card was
# paid in the previous month.
colnames(data)[60]
## [1] "dh_min_dia_pago_tarj_d"
hist(data$dh_min_dia_pago_tarj_d)

# Column 61 - cp_nro_cuota: number of agreed installments per product.
# The summary below is all zeros.
colnames(data)[61]
## [1] "cp_nro_cuota"
summary(data[["cp_nro_cuota"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[62] # dmi_egreso_total_mes: total outflow (expenses) of the previous month
## [1] "dmi_egreso_total_mes"
summary(data$dmi_egreso_total_mes)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   6710852  18203268  24720956 191314996
hist(data$dmi_egreso_total_mes)

# Percentiles in 1% steps, printed to inspect the tails of the distribution.
quantile(data$dmi_egreso_total_mes, seq(0,1,by=0.01))
##           0%           1%           2%           3%           4%           5% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##           6%           7%           8%           9%          10%          11% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          12%          13%          14%          15%          16%          17% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          18%          19%          20%          21%          22%          23% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          24%          25%          26%          27%          28%          29% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          30%          31%          32%          33%          34%          35% 
##         0.00         0.00         0.00         0.00         0.00     11399.35 
##          36%          37%          38%          39%          40%          41% 
##     65750.00    144062.00    476365.00    857438.55   1373997.80   2032524.46 
##          42%          43%          44%          45%          46%          47% 
##   2425098.10   3147436.28   3637537.36   4297695.00   4985629.00   5242864.00 
##          48%          49%          50%          51%          52%          53% 
##   5768836.60   6471482.00   6710852.00   7234899.00   7941029.00   8321595.04 
##          54%          55%          56%          57%          58%          59% 
##   9066380.48   9451729.00  10147091.00  11474139.00  11927600.00  12472455.70 
##          60%          61%          62%          63%          64%          65% 
##  13236143.00  13635595.00  14447329.22  15199359.70  16050052.32  16698889.00 
##          66%          67%          68%          69%          70%          71% 
##  17675826.00  18687460.99  19390695.00  20624901.00  21419482.00  21608609.00 
##          72%          73%          74%          75%          76%          77% 
##  22283315.00  23681242.17  23844136.20  24720956.00  25905897.48  27633120.00 
##          78%          79%          80%          81%          82%          83% 
##  29004586.56  30127755.34  31398939.00  31492755.00  32045289.72  35278365.00 
##          84%          85%          86%          87%          88%          89% 
##  40680145.00  43554385.50  45389761.66  48031045.00  52135165.00  56926454.85 
##          90%          91%          92%          93%          94%          95% 
##  60640219.00  64217571.00  66239075.00  68137836.00  71343296.66  78399975.45 
##          96%          97%          98%          99%         100% 
##  80045627.00  80352549.82  89004556.92 111049163.72 191314996.00
# Keep only the values at or below the hard-coded cap (entries whose comparison
# is NA are dropped) and plot what is left.
# With the data printed above (maximum 191314996) the cap of 209829245 removes nothing.
x <- data[["dmi_egreso_total_mes"]][which(data[["dmi_egreso_total_mes"]] <= 209829245.0)]
hist(x)

# Standalone copy of the full, unfiltered column.
dmi_egreso_total_mes <- data[["dmi_egreso_total_mes"]]

names(data)[63] # cp_valor_inicial: initial value of the obligation, per product
## [1] "cp_valor_inicial"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_valor_inicial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[64] # dh_max_dia_otros_d: last day on which a debit classified as "otros" (other) was made in the previous month
## [1] "dh_max_dia_otros_d"
hist(data$dh_max_dia_otros_d)

# Indicator: 1 when the last "otros" debit fell on day 15 or earlier (first
# half of the month), 0 otherwise; NA days stay NA.
# NOTE(review): a day value of 0 would also be flagged as 1 — confirm what 0 encodes in this column.
x <- ifelse(data$dh_max_dia_otros_d <= 15, 1, 0)
table(x)
## x
##   0   1 
## 387 515
# Store the indicator under the "flag_" prefix used by the script's other
# indicators (flag_maxDiaPagos_d_1quincena, flag_encontrado_cifin).
flag_maxDiaOtrosD_1quincena <- x
# Backward-compatible alias: this flag was originally stored under the
# misspelled name "flax_...", so keep that name defined for any code that reads it.
flax_maxDiaOtrosD_1quincena <- flag_maxDiaOtrosD_1quincena

names(data)[65] # cp_cuota_sobre_saldo: instalment value over balance, per product
## [1] "cp_cuota_sobre_saldo"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_cuota_sobre_saldo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[66] # dc_sum_valor_inicial: sum of the initial values of obligations across the whole financial system
## [1] "dc_sum_valor_inicial"
summary(data$dc_sum_valor_inicial)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0         0    501971         0 310276000
hist(data$dc_sum_valor_inicial)

names(data)[67] # dh_cant_pago_tarj_d: number of outgoing transactions for credit-card payments
## [1] "dh_cant_pago_tarj_d"
summary(data$dh_cant_pago_tarj_d)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.9678  1.0000 33.0000
hist(data$dh_cant_pago_tarj_d)

colnames(data)[68] # dh_max_dia_pagos_d: last day on which any "credito" was made in the previous month (description as given by the author)
## [1] "dh_max_dia_pagos_d"
hist(data$dh_max_dia_pagos_d)

# First-fortnight indicator: 1 when the day is 15 or earlier, 0 otherwise (NA stays NA).
# NOTE(review): a day value of 0 would also be flagged as 1 — confirm what 0 encodes in this column.
x <- as.numeric(data$dh_max_dia_pagos_d <= 15)
table(x)
## x
##   0   1 
## 505 397
# Keep the indicator as a standalone global vector.
flag_maxDiaPagos_d_1quincena <- x

names(data)[69] # cp_saldo_sobre_inicial: balance over initial value, per product, previous month
## [1] "cp_saldo_sobre_inicial"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_saldo_sobre_inicial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[70] # pc_mediana_nom3: median of the last 3 payroll payments, used for the "retanqueo libranza" calculation
## [1] "pc_mediana_nom3"
summary(data$pc_mediana_nom3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   50084       0 4287978
hist(data$pc_mediana_nom3)

names(data)[71] # cp_esta_cuota_otro: status of the "otro" (other) instalment
## [1] "cp_esta_cuota_otro"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_esta_cuota_otro)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[72] # dh_max_dia_retiros_d: last day on which a withdrawal was made in the previous month
## [1] "dh_max_dia_retiros_d"
hist(data$dh_max_dia_retiros_d)

# Author's note (translated): "incorrect variable, the 3rd quartile is 30 and a month has up to 31 days".
# NOTE(review): no summary() output is shown for this column, so the quartile cannot be checked here,
# and a third quartile of 30 is not by itself inconsistent with 31-day months — confirm why the variable is rejected.

names(data)[73] # dh_avg_dia_entradas: average day of the month on which incoming money is received
## [1] "dh_avg_dia_entradas"
summary(data$dh_avg_dia_entradas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   4.585   4.457   6.557  31.000
hist(data$dh_avg_dia_entradas)

colnames(data)[74] # dh_avg_dia_pagos_d: average day of the month on which payment outflows are made
## [1] "dh_avg_dia_pagos_d"
# Exploratory indicator: 1 when the average day is 10 or earlier, 0 otherwise (NA stays NA).
# Only tabulated — it is not stored under a name of its own, and 901 of the 902 rows fall in class 1.
x <- as.numeric(data$dh_avg_dia_pagos_d <= 10)
table(x)
## x
##   0   1 
##   1 901
names(data)[75] # dh_val_salidas: total value of the outgoing transactions in a month
## [1] "dh_val_salidas"
summary(data$dh_val_salidas)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   6710852  18220966  24720956 191314996
hist(data$dh_val_salidas)

# Percentiles in 1% steps, printed to inspect the tails of the distribution.
quantile(data$dh_val_salidas, seq(0,1,by=0.01))
##           0%           1%           2%           3%           4%           5% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##           6%           7%           8%           9%          10%          11% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          12%          13%          14%          15%          16%          17% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          18%          19%          20%          21%          22%          23% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          24%          25%          26%          27%          28%          29% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          30%          31%          32%          33%          34%          35% 
##         0.00         0.00         0.00         0.00     11399.34     38171.30 
##          36%          37%          38%          39%          40%          41% 
##     97964.00    275655.54    524117.36   1168205.09   1781441.00   2117572.83 
##          42%          43%          44%          45%          46%          47% 
##   2514844.00   3349365.00   3817628.00   4412149.25   5013252.92   5242864.00 
##          48%          49%          50%          51%          52%          53% 
##   5768836.60   6471482.00   6710852.00   7234899.00   7941029.00   8321595.04 
##          54%          55%          56%          57%          58%          59% 
##   9066380.48   9451729.00  10147091.00  11474139.00  11927600.00  12472455.70 
##          60%          61%          62%          63%          64%          65% 
##  13236143.00  13635595.00  14447329.22  15199359.70  16050052.32  16698889.00 
##          66%          67%          68%          69%          70%          71% 
##  17675826.00  18687460.99  19390695.00  20624901.00  21419482.00  21608609.00 
##          72%          73%          74%          75%          76%          77% 
##  22283315.00  23681242.17  23844136.20  24720956.00  25905897.48  27633120.00 
##          78%          79%          80%          81%          82%          83% 
##  29004586.56  30127755.34  31398939.00  31492755.00  32045289.72  35278365.00 
##          84%          85%          86%          87%          88%          89% 
##  40680145.00  43554385.50  45389761.66  48031045.00  52135165.00  56926454.85 
##          90%          91%          92%          93%          94%          95% 
##  60640219.00  64217571.00  66239075.00  68137836.00  71343296.66  78399975.45 
##          96%          97%          98%          99%         100% 
##  80045627.00  80352549.82  89004556.92 111049163.72 191314996.00
# Keep only the values at or below the hard-coded cap (entries whose comparison
# is NA are dropped) and plot what is left.
# With the data printed above (maximum 191314996) the cap of 217011100 removes nothing.
x <- data[["dh_val_salidas"]][which(data[["dh_val_salidas"]] <= 217011100)]
hist(x)

# Standalone copy of the full, unfiltered column.
dh_val_salidas <- data[["dh_val_salidas"]]

# NOTE(review): the description originally written for this column ("total value of the
# outgoing transactions in a month") is word-for-word the one given for dh_val_salidas and
# looks copy-pasted; judging by the column name it is presumably the sum of the instalment
# values of the customer's obligations — confirm against the data dictionary.
names(data)[76] # dc_sum_valor_cuota: presumably the sum of instalment values — TODO confirm
## [1] "dc_sum_valor_cuota"
summary(data$dc_sum_valor_cuota)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    33407        0 13360000
hist(data$dc_sum_valor_cuota)

colnames(data)[77] # dh_min_dia_tras_d: first day on which a transfer payment was made in the previous month
## [1] "dh_min_dia_tras_d"
hist(data$dh_min_dia_tras_d)

# Exploratory indicator: 1 when the day is 10 or earlier, 0 otherwise (NA stays NA).
# Only tabulated — it is not stored under a name of its own.
x <- as.numeric(data$dh_min_dia_tras_d <= 10)
table(x)
## x
##   0   1 
##  99 803
names(data)[78] # cp_porc_valorcuot_ing: ratio of the instalment value to income, per product
## [1] "cp_porc_valorcuot_ing"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_porc_valorcuot_ing)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[79] # pc_ind_ajustado: the customer's adjusted net disposable income
## [1] "pc_ind_ajustado"
# Note the negative minimum below: this column is not bounded at 0.
summary(data$pc_ind_ajustado)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4115281   994836  3591609  6805845  9008932 46552980
hist(data$pc_ind_ajustado)

# Percentiles in 1% steps, printed to inspect the tails of the distribution.
quantile(data$pc_ind_ajustado, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
## -4115280.80 -1826179.99  -289405.70   -96562.97        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00    48003.75 
##         18%         19%         20%         21%         22%         23% 
##   109162.44   190428.75   246883.42   538385.69   648167.69   799010.24 
##         24%         25%         26%         27%         28%         29% 
##   859590.00   994835.77  1157424.69  1289365.08  1350754.47  1404675.00 
##         30%         31%         32%         33%         34%         35% 
##  1480053.89  1587621.36  1587621.36  1650235.75  1822835.97  1879208.30 
##         36%         37%         38%         39%         40%         41% 
##  1948169.40  2003917.50  2154537.00  2518881.75  2677500.00  2822090.15 
##         42%         43%         44%         45%         46%         47% 
##  2911943.92  3091390.95  3190972.67  3190972.67  3241722.38  3241722.38 
##         48%         49%         50%         51%         52%         53% 
##  3413106.65  3505538.87  3591609.21  3591609.21  3872384.69  4317499.00 
##         54%         55%         56%         57%         58%         59% 
##  4357215.99  4474653.75  4668131.63  4848009.69  4974054.87  5360090.10 
##         60%         61%         62%         63%         64%         65% 
##  5741412.68  5741412.68  5777751.94  5859048.42  5895596.49  6025889.71 
##         66%         67%         68%         69%         70%         71% 
##  6098398.45  6318411.44  6806462.97  7365699.76  7610175.00  7897292.18 
##         72%         73%         74%         75%         76%         77% 
##  8258805.03  8432540.03  8888342.46  9008932.31 10100269.49 10443540.70 
##         78%         79%         80%         81%         82%         83% 
## 11475000.00 11744826.48 11744826.48 11854694.27 12430296.87 13127593.73 
##         84%         85%         86%         87%         88%         89% 
## 13154771.40 14039078.44 14122015.13 15690171.27 15790025.89 18134416.04 
##         90%         91%         92%         93%         94%         95% 
## 18941384.29 21621182.46 22345186.29 23339886.01 23429767.22 24527785.16 
##         96%         97%         98%         99%        100% 
## 26203986.86 30567968.50 30946818.46 36547579.52 46552980.18
# Keep only the values at or below the hard-coded cap (entries whose comparison
# is NA are dropped) and plot what is left.
# With the data printed above (maximum 46552980.18) the cap of 68107989.8 removes nothing.
x <- data[["pc_ind_ajustado"]][which(data[["pc_ind_ajustado"]] <= 68107989.8)]
hist(x)

names(data)[80] # dh_val_retiros_d: total value of the outgoing transactions for withdrawals in a month
## [1] "dh_val_retiros_d"
summary(data$dh_val_retiros_d)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0   1048890   8526310  10895497 164054718
hist(data$dh_val_retiros_d)

names(data)[81] # pc_tiem_lt_prod_abie_total: time since the last product was opened (unit not stated)
## [1] "pc_tiem_lt_prod_abie_total"
summary(data$pc_tiem_lt_prod_abie_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03104 0.00000 9.00000
hist(data$pc_tiem_lt_prod_abie_total)

colnames(data)[82] # marca_info_cifin_decode: CIFIN flag (consulted, not consulted, not found, etc.)
## [1] "marca_info_cifin_decode"
# Frequency of each code present in the column.
table(data[["marca_info_cifin_decode"]])
## 
##   0   1   2 
## 645   5 252
# Indicator: 1 when the code is 0, 0 for any other code (NA stays NA).
# NOTE(review): the flag name implies that code 0 means "found in CIFIN"; the
# code-to-label mapping is not shown here — confirm against the data dictionary.
x <- as.numeric(data$marca_info_cifin_decode == 0)
table(x)
## x
##   0   1 
## 257 645
# Keep the indicator as a standalone global vector.
flag_encontrado_cifin <- x

names(data)[83] # dh_max_dia_pago_tarj_d: last day on which the credit card was paid in the previous month
## [1] "dh_max_dia_pago_tarj_d"
hist(data$dh_max_dia_pago_tarj_d)

names(data)[84] # dc_valobli_ing: sum of the initial value of the obligations in the financial system, over income
## [1] "dc_valobli_ing"
summary(data$dc_valobli_ing)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.03364  0.00000 23.86188
hist(data$dc_valobli_ing)

names(data)[85] # pc_cantidad_tdc_entidad: number of credit cards held at the bank
## [1] "pc_cantidad_tdc_entidad"
# Almost constant in the current data: 899 of the 902 rows are 0.
table(data$pc_cantidad_tdc_entidad)
## 
##   0   1   2 
## 899   2   1
colnames(data)[86] # dh_min_dia_otros_d: first day on which a debit classified as "otros" (other) was made in the previous month
## [1] "dh_min_dia_otros_d"
hist(data$dh_min_dia_otros_d)

# Exploratory first-fortnight indicator: 1 when the day is 15 or earlier, 0 otherwise (NA stays NA).
# Only tabulated — it is not stored under a name of its own.
x <- as.numeric(data$dh_min_dia_otros_d <= 15)
table(x)
## x
##   0   1 
##  65 837
colnames(data)[87] # dc_cant_obligaciones: number of obligations
## [1] "dc_cant_obligaciones"
# Frequency of each distinct count, followed by the usual summary and histogram.
table(data[["dc_cant_obligaciones"]])
## 
##   0   3   8  10  15 
## 897   1   2   1   1
summary(data[["dc_cant_obligaciones"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.04878  0.00000 15.00000
hist(data$dc_cant_obligaciones)

# Exploratory indicator: 1 when the customer has at least one obligation, 0 otherwise (NA stays NA).
# Only tabulated — it is not stored under a name of its own; just 5 of the 902 rows are 1.
x <- as.numeric(data$dc_cant_obligaciones >= 1)
table(x)
## x
##   0   1 
## 897   5
names(data)[88] # cpc_sum_nro_cuota: sum of the number of instalments over all of the customer's obligations
## [1] "cpc_sum_nro_cuota"
summary(data$cpc_sum_nro_cuota)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0    25.5    93.3   134.0   778.0
names(data)[89] # cpc_avg_nro_cuota: average number of instalments across all of the customer's products
## [1] "cpc_avg_nro_cuota"
summary(data$cpc_avg_nro_cuota)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   20.33   39.96   66.38  305.50
hist(data$cpc_avg_nro_cuota)

# Keep a standalone copy of the column as a global vector.
cpc_avg_nro_cuota <- data$cpc_avg_nro_cuota

names(data)[90] # cpc_max_nro_cuota: maximum number of instalments of a customer across all of their products
## [1] "cpc_max_nro_cuota"
names(data)[91] # cp_saldo: balance of the previous month, per product
## [1] "cp_saldo"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_saldo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[92] # cp_cuota_sobre_inicial: instalment value over the initially disbursed value, per product
## [1] "cp_cuota_sobre_inicial"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_cuota_sobre_inicial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[93] # cpc_sum_saldo: sum of the balances of all of the customer's active products
## [1] "cpc_sum_saldo"
# Note the negative minimum below: balances are not bounded at 0.
summary(data$cpc_sum_saldo)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    -70455         0     33950  12216024   9920492 146717108
hist(data$cpc_sum_saldo)

# Percentiles in 1% steps, printed to inspect the tails of the distribution.
quantile(data$cpc_sum_saldo, seq(0,1,by=0.01))
##           0%           1%           2%           3%           4%           5% 
##    -70455.00    -65826.00    -19656.58         0.00         0.00         0.00 
##           6%           7%           8%           9%          10%          11% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          12%          13%          14%          15%          16%          17% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          18%          19%          20%          21%          22%          23% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          24%          25%          26%          27%          28%          29% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          30%          31%          32%          33%          34%          35% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          36%          37%          38%          39%          40%          41% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          42%          43%          44%          45%          46%          47% 
##         0.00         0.00         0.00         0.00     24490.00     31920.00 
##          48%          49%          50%          51%          52%          53% 
##     33950.00     33950.00     33950.00    237879.40    472658.20    738957.00 
##          54%          55%          56%          57%          58%          59% 
##    985207.00   1420982.55   1946855.76   2051116.00   2437914.82   2604606.00 
##          60%          61%          62%          63%          64%          65% 
##   2704777.80   3033503.54   3199161.00   3792700.64   4372629.00   4874872.00 
##          66%          67%          68%          69%          70%          71% 
##   5131967.56   5649048.00   7246359.24   7536951.00   7706847.00   8010945.00 
##          72%          73%          74%          75%          76%          77% 
##   8457241.00   8746891.55   9134322.00   9920492.50  11517312.00  12052637.97 
##          78%          79%          80%          81%          82%          83% 
##  12249094.00  14379750.00  15182781.20  16772655.45  17733736.74  18370147.00 
##          84%          85%          86%          87%          88%          89% 
##  19785500.00  23100863.00  23505359.00  26303831.91  27975363.72  29763054.02 
##          90%          91%          92%          93%          94%          95% 
##  33065306.00  36919522.00  47807424.63  73378700.49  78844063.50  80180499.99 
##          96%          97%          98%          99%         100% 
##  83182148.00  84670677.73 112508218.03 128558087.07 146717108.20
# Keep only the values at or below the hard-coded cap (entries whose comparison
# is NA are dropped) and plot what is left.
# With the data printed above (maximum 146717108.20) the cap of 220905770 removes nothing.
x <- data[["cpc_sum_saldo"]][which(data[["cpc_sum_saldo"]] <= 220905770.00)]
hist(x)

# Standalone copy of the full, unfiltered column.
cpc_sum_saldo <- data[["cpc_sum_saldo"]]

names(data)[94] # cp_porc_saldo_ing: balance over income, per product
## [1] "cp_porc_saldo_ing"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_porc_saldo_ing)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[95] # gsm_mejor_gestion: best "gestión" (management action) carried out in the previous month
## [1] "gsm_mejor_gestion"
# Almost constant in the current data: 901 of the 902 rows are 0.
table(data$gsm_mejor_gestion)
## 
##   0  16 
## 901   1
names(data)[96] # dh_min_dia_nomina_c: no description given; presumably the first day payroll was received in the previous month, by analogy with dh_max_dia_nomina_c — TODO confirm
## [1] "dh_min_dia_nomina_c"
names(data)[97] # dh_max_dia_nomina_c: last day on which a payroll payment was received in the previous month
## [1] "dh_max_dia_nomina_c"
hist(data$dh_max_dia_nomina_c)

names(data)[98] # cp_valor_cuota: instalment value, per product
## [1] "cp_valor_cuota"
# Every statistic below is 0, i.e. the column is constant in the current data.
summary(data$cp_valor_cuota)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
names(data)[99] # cpc_nro_cuota_tdc: sum of the number of instalments over all of the customer's credit cards
## [1] "cpc_nro_cuota_tdc"
names(data)[100] # gsm_prom_dias_gest: average of the days on which management actions were carried out in the previous month
## [1] "gsm_prom_dias_gest"
names(data)[101] # pc_cuota_no_rot_ent: instalment of non-revolving products at the bank
## [1] "pc_cuota_no_rot_ent"
summary(data$pc_cuota_no_rot_ent)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    1527       0 1377000
hist(data$pc_cuota_no_rot_ent)

names(data)[102] # dh_val_nomina_c: total value of the incoming transactions for payroll in a month
## [1] "dh_val_nomina_c"
summary(data$dh_val_nomina_c)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0   60297       0 7256848
hist(data$dh_val_nomina_c)

# Percentiles in 1% steps; in the output that follows, every percentile up to the 98th is 0.
quantile(data$dh_val_nomina_c, seq(0,1,by=0.01))
##      0%      1%      2%      3%      4%      5%      6%      7%      8%      9% 
##       0       0       0       0       0       0       0       0       0       0 
##     10%     11%     12%     13%     14%     15%     16%     17%     18%     19% 
##       0       0       0       0       0       0       0       0       0       0 
##     20%     21%     22%     23%     24%     25%     26%     27%     28%     29% 
##       0       0       0       0       0       0       0       0       0       0 
##     30%     31%     32%     33%     34%     35%     36%     37%     38%     39% 
##       0       0       0       0       0       0       0       0       0       0 
##     40%     41%     42%     43%     44%     45%     46%     47%     48%     49% 
##       0       0       0       0       0       0       0       0       0       0 
##     50%     51%     52%     53%     54%     55%     56%     57%     58%     59% 
##       0       0       0       0       0       0       0       0       0       0 
##     60%     61%     62%     63%     64%     65%     66%     67%     68%     69% 
##       0       0       0       0       0       0       0       0       0       0 
##     70%     71%     72%     73%     74%     75%     76%     77%     78%     79% 
##       0       0       0       0       0       0       0       0       0       0 
##     80%     81%     82%     83%     84%     85%     86%     87%     88%     89% 
##       0       0       0       0       0       0       0       0       0       0 
##     90%     91%     92%     93%     94%     95%     96%     97%     98%     99% 
##       0       0       0       0       0       0       0       0       0 3227685 
##    100% 
## 7256848
names(data)[103] # banca_completa: whether or not the customer belongs to the "banca completa" segment
## [1] "banca_completa"
# Constant in the current data: all 902 rows are 0.
table(data$banca_completa)
## 
##   0 
## 902
names(data)[104] # cpc_saldo_sobre_ing: balance over income, per customer
## [1] "cpc_saldo_sobre_ing"
# Note the slightly negative minimum below.
summary(data$cpc_saldo_sobre_ing)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.01392  0.00000  0.00392  0.75912  0.43980 11.63406
hist(data$cpc_saldo_sobre_ing)

# Percentiles in 1% steps, printed to inspect the tails of the distribution.
quantile(data$cpc_saldo_sobre_ing, seq(0,1,by=0.01))
##            0%            1%            2%            3%            4% 
## -0.0139196450 -0.0026716620 -0.0004434407  0.0000000000  0.0000000000 
##            5%            6%            7%            8%            9% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           10%           11%           12%           13%           14% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           15%           16%           17%           18%           19% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           20%           21%           22%           23%           24% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           25%           26%           27%           28%           29% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           30%           31%           32%           33%           34% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           35%           36%           37%           38%           39% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           40%           41%           42%           43%           44% 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##           45%           46%           47%           48%           49% 
##  0.0000000000  0.0006086835  0.0036859120  0.0039203230  0.0039203230 
##           50%           51%           52%           53%           54% 
##  0.0039203230  0.0091222870  0.0160144150  0.0241167220  0.0342690350 
##           55%           56%           57%           58%           59% 
##  0.0516975909  0.0762671575  0.0871885689  0.0959601992  0.0996937647 
##           60%           61%           62%           63%           64% 
##  0.1100301990  0.1167548170  0.1181152130  0.1414183478  0.1527656520 
##           65%           66%           67%           68%           69% 
##  0.2038563155  0.2406806960  0.2590162859  0.2652612750  0.2722557190 
##           70%           71%           72%           73%           74% 
##  0.3197385828  0.3433782008  0.3739022130  0.3850143830  0.4290230500 
##           75%           76%           77%           78%           79% 
##  0.4397979330  0.5008516424  0.5767091904  0.6159902625  0.7295992450 
##           80%           81%           82%           83%           84% 
##  0.8236684390  0.8864274193  0.9934920512  1.0631846570  1.1306370686 
##           85%           86%           87%           88%           89% 
##  1.1725284910  1.2668796990  1.4941067817  1.8447025345  1.9151466040 
##           90%           91%           92%           93%           94% 
##  2.1176921894  2.1883557560  2.2844911496  2.9200289999  5.4915779623 
##           95%           96%           97%           98%           99% 
##  5.8321611310  6.7831752090  7.1059412270  7.4480707058  8.7215061950 
##          100% 
## 11.6340582200
# Keep only the values at or below the hard-coded cap (entries whose comparison
# is NA are dropped) and plot what is left.
# With the data printed above (maximum 11.63405822) the cap of 20.307294513 removes nothing.
x <- data[["cpc_saldo_sobre_ing"]][which(data[["cpc_saldo_sobre_ing"]] <= 20.307294513)]
hist(x)

# Standalone copy of the full, unfiltered column.
cpc_saldo_sobre_ing <- data[["cpc_saldo_sobre_ing"]]

names(data)[105] # dh_min_dia_pago_cred_d: first day on which a loan ("credito") payment was made in the previous month
## [1] "dh_min_dia_pago_cred_d"
hist(data$dh_min_dia_pago_cred_d)

# Keep a standalone copy of the column as a global vector.
dh_min_dia_pago_cred_d <- data$dh_min_dia_pago_cred_d

names(data)[106] # cpc_saldo_tdc: the customer's total credit-card balance
## [1] "cpc_saldo_tdc"
# Note the negative minimum below: balances are not bounded at 0.
summary(data$cpc_saldo_tdc)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   -70455        0    33950  5068261  8010945 36919522
hist(data$cpc_saldo_tdc)

# Percentiles in 1% steps, printed to inspect the tails of the distribution.
quantile(data$cpc_saldo_tdc, seq(0,1,by=0.01))
##          0%          1%          2%          3%          4%          5% 
##   -70455.00   -65826.00   -20000.00        0.00        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         18%         19%         20%         21%         22%         23% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         24%         25%         26%         27%         28%         29% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         30%         31%         32%         33%         34%         35% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         36%         37%         38%         39%         40%         41% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         42%         43%         44%         45%         46%         47% 
##        0.00        0.00        0.00        0.00        0.00     9483.00 
##         48%         49%         50%         51%         52%         53% 
##    31920.00    33950.00    33950.00    33950.00    40620.92   242390.33 
##         54%         55%         56%         57%         58%         59% 
##   350295.68   517486.50   738957.00   985207.00  1425073.34  1883099.63 
##         60%         61%         62%         63%         64%         65% 
##  2025065.80  2296037.81  2585201.00  2642201.49  2905231.56  3156513.95 
##         66%         67%         68%         69%         70%         71% 
##  3494659.42  4090121.68  4507996.36  4949410.41  5262763.00  5664027.00 
##         72%         73%         74%         75%         76%         77% 
##  7246970.96  7536951.00  7706353.52  8010945.00  8318098.24  8616938.17 
##         78%         79%         80%         81%         82%         83% 
##  9134322.00  9296475.24 10437137.80 11181913.72 11687764.00 12248475.71 
##         84%         85%         86%         87%         88%         89% 
## 12852364.00 13440597.70 14740435.54 15089862.00 16685620.00 17175586.00 
##         90%         91%         92%         93%         94%         95% 
## 17905425.00 18549522.56 19785500.00 21713390.07 22926946.00 23100863.00 
##         96%         97%         98%         99%        100% 
## 24470898.84 26387876.00 28790737.34 31877765.63 36919522.00
# Keep only the values at or below the hard-coded cap (entries whose comparison
# is NA are dropped) and plot what is left.
# With the data printed above (maximum 36919522) the cap of 43186883.96 removes nothing.
x <- data[["cpc_saldo_tdc"]][which(data[["cpc_saldo_tdc"]] <= 43186883.96)]
hist(x)

# Standalone copy of the full, unfiltered column.
cpc_saldo_tdc <- data[["cpc_saldo_tdc"]]

names(data)[107] # pc_cuota_de_consumo: consumer-credit instalment reported by CIFIN
## [1] "pc_cuota_de_consumo"
summary(data$pc_cuota_de_consumo)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    5096       0 3356000
hist(data$pc_cuota_de_consumo)

names(data)[108] # dh_cant_tras_d: number of outgoing money transfers in a month
## [1] "dh_cant_tras_d"
hist(data$dh_cant_tras_d)

# Percentiles in 0.5% steps (finer than the 1% grid used for the other columns).
quantile(data$dh_cant_tras_d, seq(0,1,by=0.005))
##   0.0%   0.5%   1.0%   1.5%   2.0%   2.5%   3.0%   3.5%   4.0%   4.5%   5.0% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##   5.5%   6.0%   6.5%   7.0%   7.5%   8.0%   8.5%   9.0%   9.5%  10.0%  10.5% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  11.0%  11.5%  12.0%  12.5%  13.0%  13.5%  14.0%  14.5%  15.0%  15.5%  16.0% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  16.5%  17.0%  17.5%  18.0%  18.5%  19.0%  19.5%  20.0%  20.5%  21.0%  21.5% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  22.0%  22.5%  23.0%  23.5%  24.0%  24.5%  25.0%  25.5%  26.0%  26.5%  27.0% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  27.5%  28.0%  28.5%  29.0%  29.5%  30.0%  30.5%  31.0%  31.5%  32.0%  32.5% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  33.0%  33.5%  34.0%  34.5%  35.0%  35.5%  36.0%  36.5%  37.0%  37.5%  38.0% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  38.5%  39.0%  39.5%  40.0%  40.5%  41.0%  41.5%  42.0%  42.5%  43.0%  43.5% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  44.0%  44.5%  45.0%  45.5%  46.0%  46.5%  47.0%  47.5%  48.0%  48.5%  49.0% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  49.5%  50.0%  50.5%  51.0%  51.5%  52.0%  52.5%  53.0%  53.5%  54.0%  54.5% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  55.0%  55.5%  56.0%  56.5%  57.0%  57.5%  58.0%  58.5%  59.0%  59.5%  60.0% 
##  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000 
##  60.5%  61.0%  61.5%  62.0%  62.5%  63.0%  63.5%  64.0%  64.5%  65.0%  65.5% 
##  0.000  0.000  0.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000 
##  66.0%  66.5%  67.0%  67.5%  68.0%  68.5%  69.0%  69.5%  70.0%  70.5%  71.0% 
##  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  2.000  2.000  2.000 
##  71.5%  72.0%  72.5%  73.0%  73.5%  74.0%  74.5%  75.0%  75.5%  76.0%  76.5% 
##  2.000  2.000  2.000  2.000  2.000  2.000  3.000  3.000  3.000  3.000  3.000 
##  77.0%  77.5%  78.0%  78.5%  79.0%  79.5%  80.0%  80.5%  81.0%  81.5%  82.0% 
##  3.000  3.000  4.000  4.000  4.000  4.000  4.000  4.000  4.000  5.000  5.000 
##  82.5%  83.0%  83.5%  84.0%  84.5%  85.0%  85.5%  86.0%  86.5%  87.0%  87.5% 
##  5.000  5.000  6.000  6.000  6.000  6.000  6.000  6.000  6.000  7.000  7.000 
##  88.0%  88.5%  89.0%  89.5%  90.0%  90.5%  91.0%  91.5%  92.0%  92.5%  93.0% 
##  7.000  8.000  8.000  9.000  9.000  9.000  9.000 10.000 10.000 10.000 10.000 
##  93.5%  94.0%  94.5%  95.0%  95.5%  96.0%  96.5%  97.0%  97.5%  98.0%  98.5% 
## 11.000 11.940 12.000 13.000 13.000 14.000 15.465 16.000 18.000 20.000 21.000 
##  99.0%  99.5% 100.0% 
## 21.000 21.495 30.000
# Keep the unfiltered column as a standalone vector.
dh_cant_tras_d <- data$dh_cant_tras_d

# Histogram input: the cap of 31 is above the maximum shown in the quantiles
# printed above (30), so every value is kept.
x <- dh_cant_tras_d[which(dh_cant_tras_d <= 31)]
hist(x)

colnames(data)[109] # dh_max_dia_comisio_d
## [1] "dh_max_dia_comisio_d"
colnames(data)[110] # cpc_avg_saldo: average balance of the customer's obligations in the previous month
## [1] "cpc_avg_saldo"
summary(data[["cpc_avg_saldo"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    -70455         0     16975   4949118   5029954 103158078
hist(data$cpc_avg_saldo)

# Percentiles in 1% steps.
quantile(data[["cpc_avg_saldo"]], probs = seq(0, 1, by = 0.01))
##           0%           1%           2%           3%           4%           5% 
##    -70455.00    -65826.00    -19628.29         0.00         0.00         0.00 
##           6%           7%           8%           9%          10%          11% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          12%          13%          14%          15%          16%          17% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          18%          19%          20%          21%          22%          23% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          24%          25%          26%          27%          28%          29% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          30%          31%          32%          33%          34%          35% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          36%          37%          38%          39%          40%          41% 
##         0.00         0.00         0.00         0.00         0.00         0.00 
##          42%          43%          44%          45%          46%          47% 
##         0.00         0.00         0.00         0.00     14169.66     16975.00 
##          48%          49%          50%          51%          52%          53% 
##     16975.00     16975.00     16975.00    118939.70    305999.12    433154.00 
##          54%          55%          56%          57%          58%          59% 
##    601811.48    796185.15    993033.54   1175050.00   1302303.00   1493580.88 
##          60%          61%          62%          63%          64%          65% 
##   1694424.33   1897335.85   2052581.75   2283110.98   2580505.34   2724391.55 
##          66%          67%          68%          69%          70%          71% 
##   2921941.00   3211054.84   3662326.44   3909918.00   4267080.33   4304110.50 
##          72%          73%          74%          75%          76%          77% 
##   4535613.02   4567161.00   4864807.92   5029954.00   5649048.00   6121840.71 
##          78%          79%          80%          81%          82%          83% 
##   6553668.23   6977784.50   7230429.80   7465720.25   7623077.00   7910066.74 
##          84%          85%          86%          87%          88%          89% 
##   8316979.75   9229880.50  10935486.07  11550431.50  12813655.74  13193938.00 
##          90%          91%          92%          93%          94%          95% 
##  13839320.64  14111779.62  16685620.00  17892250.14  22239378.76  24469808.29 
##          96%          97%          98%          99%         100% 
##  26367451.45  28919972.77  42970063.02  55008435.80 103158078.10
# Keep the unfiltered column as a standalone vector.
cpc_avg_saldo <- data$cpc_avg_saldo

# Histogram input: the cap is above the 100% quantile printed above
# (103158078.10), so every value is kept.
x <- cpc_avg_saldo[which(cpc_avg_saldo <= 106884393.50)]
hist(x)

colnames(data)[111] # dc_max_saldo_sf: maximum balance in the financial system (financial sector only)
## [1] "dc_max_saldo_sf"
summary(data[["dc_max_saldo_sf"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0   167955        0 69454000
colnames(data)[112] # dh_val_pago_tarj_d: total value of outgoing transactions for credit-card (tdc) payments in a month
## [1] "dh_val_pago_tarj_d"
summary(data[["dh_val_pago_tarj_d"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0   555679    61884 10050161
hist(data$dh_val_pago_tarj_d)

# Percentiles in 1% steps.
quantile(data[["dh_val_pago_tarj_d"]], probs = seq(0, 1, by = 0.01))
##          0%          1%          2%          3%          4%          5% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##          6%          7%          8%          9%         10%         11% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         12%         13%         14%         15%         16%         17% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         18%         19%         20%         21%         22%         23% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         24%         25%         26%         27%         28%         29% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         30%         31%         32%         33%         34%         35% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         36%         37%         38%         39%         40%         41% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         42%         43%         44%         45%         46%         47% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         48%         49%         50%         51%         52%         53% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         54%         55%         56%         57%         58%         59% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         60%         61%         62%         63%         64%         65% 
##        0.00        0.00        0.00        0.00        0.00        0.00 
##         66%         67%         68%         69%         70%         71% 
##        0.00        0.00    12200.00    20000.00    31920.00    31920.00 
##         72%         73%         74%         75%         76%         77% 
##    33950.00    33950.00    33950.00    61884.25   127762.36   189655.46 
##         78%         79%         80%         81%         82%         83% 
##   238181.48   453527.86   524195.40   602165.00   803614.04   954853.36 
##         84%         85%         86%         87%         88%         89% 
##  1107856.68  1183850.15  1401638.64  1483828.00  1584879.52  1677979.52 
##         90%         91%         92%         93%         94%         95% 
##  1727225.00  1954311.91  2315173.44  2523524.00  3057532.00  3447819.00 
##         96%         97%         98%         99%        100% 
##  4709415.00  5000000.00  5132114.72  7882268.24 10050161.00
# Keep the unfiltered column as a standalone vector.
dh_val_pago_tarj_d <- data$dh_val_pago_tarj_d

# Histogram input: the cap is above the 100% quantile printed above
# (10050161), so every value is kept.
x <- dh_val_pago_tarj_d[which(dh_val_pago_tarj_d <= 10560315.0)]
hist(x)

colnames(data)[113] # pc_productos_no_rotativos_entidad: number of non-revolving products held at the bank
## [1] "pc_productos_no_rotativos_entidad"
table(data[["pc_productos_no_rotativos_entidad"]])
## 
##   0   1 
## 901   1
hist(data$pc_productos_no_rotativos_entidad)

colnames(data)[114] # pc_saldo_no_rot_ent: non-revolving obligations at the bank
## [1] "pc_saldo_no_rot_ent"
summary(data[["pc_saldo_no_rot_ent"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0        0        0    40895        0 36887000
# NOTE(review): the description below is the same text used for column 114;
# it may be a copy-paste — confirm the meaning of pc_vi_no_rotativos_entidad.
colnames(data)[115] # pc_vi_no_rotativos_entidad: non-revolving obligations at the bank
## [1] "pc_vi_no_rotativos_entidad"
summary(data[["pc_vi_no_rotativos_entidad"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     0.00    49.89     0.00 45000.00
colnames(data)[116] # dh_min_dia_entradas
## [1] "dh_min_dia_entradas"
colnames(data)[117] # pcons_hipotecario_vivienda: whether the product is a mortgage
## [1] "pcons_hipotecario_vivienda"
table(data[["pcons_hipotecario_vivienda"]])
## 
##   0   1 
## 879  23
# Keep the column as a standalone vector.
pcons_hipotecario_vivienda <- data[["pcons_hipotecario_vivienda"]]

colnames(data)[118] # gsm_mejor_gestion_3m: best collection action carried out in the last 3 months
## [1] "gsm_mejor_gestion_3m"
table(data[["gsm_mejor_gestion_3m"]])
## 
##   0  16 
## 901   1
colnames(data)[119] # dh_avg_dia_salidas: average day of the month on which money goes out
## [1] "dh_avg_dia_salidas"
# NOTE: this dataset has no y_auto_cura column (see the empty table below), so
# the positions 120/122/123 do not hold the variables they were originally
# labelled with: each printed name is the one expected a position earlier.
# The trailing comments below describe what the output actually shows.
names(data)[120] # llave (record key); the response y_auto_cura is not at this position here
## [1] "llave"
table(data$y_auto_cura) # empty: the response column is absent from this dataset
## < table of extent 0 >
# Make the missing response explicit instead of relying on `$` quietly
# returning NULL for an absent column. Y is NULL for this dataset.
Y <- if ("y_auto_cura" %in% names(data)) data[["y_auto_cura"]] else NULL

names(data)[122] # subsegmentoestructural (position 122 was labelled segmentoestructural)
## [1] "subsegmentoestructural"
table(data$segmentoestructural) # segmentoestructural: structural segment (looked up by name)
## 
## PYME 
##  902
names(data)[123] # anhomes_ciclo (position 123 was labelled subsegmentoestructural)
## [1] "anhomes_ciclo"
table(data$subsegmentoestructural) # looked up by name, so unaffected by the shift
## 
## Pyme Pequena 
##          902
# Record key, kept to label the predictions later on.
KEY <- data$llave
length(KEY)
## [1] 902
# Unify variables: bind the individually prepared vectors column-wise and
# convert the result to a data frame (column names come from the variable names).
#
# NOTE(review): if all inputs are plain vectors, cbind() builds a matrix, so
# every column is coerced to one common type before as.data.frame() runs —
# confirm all 50 inputs are numeric.
unificado <- cbind(
  max_sem, desv_sem, prom_bim, max_mes_anterior, prom_mes_anterior,
  prom_sem, max_bim, prom_trim, pc_cant_moras_30_ult_12_meses, desv_trim,
  desv_bim, dh_cant_entradas, pc_transaccional, dh_val_entradas,
  pcons_tarjeta_de_credito, dh_avg_dia_retiros_d, dmi_max_egreso_diario,
  dh_cant_otros_d, dmi_max_ingreso_diario, dh_val_otros_d,
  pc_ingreso_final, dh_cant_pagos_d, dmi_ingreso_total_mes, dh_val_pagos_d,
  pc_gasto_familiar, pc_cuotas_pagadas, pc_ingreso_rutina_con_techo,
  dh_cant_salidas, dh_min_dia_pagos_d, pc_ingreso_por_rutina,
  dmi_egreso_total_mes, dh_val_salidas, cpc_avg_nro_cuota, cpc_sum_saldo,
  cpc_saldo_sobre_ing, dh_min_dia_pago_cred_d, cpc_saldo_tdc,
  dh_cant_tras_d, cpc_avg_saldo, dh_val_pago_tarj_d,
  pcons_hipotecario_vivienda, flag_mora60_ult12meses,
  flag_ultima_entrada_1quincena, flag_tuvo_mora60_ult3meses,
  flag_diaPago_1quincena, flag_es_cluster_6, flag_tuvo_mora90_ult12M,
  flax_maxDiaOtrosD_1quincena, flag_maxDiaPagos_d_1quincena,
  flag_encontrado_cifin
)
unificado <- as.data.frame(unificado)
dim(unificado)
## [1] 902  50
colnames(unificado)
##  [1] "max_sem"                       "desv_sem"                     
##  [3] "prom_bim"                      "max_mes_anterior"             
##  [5] "prom_mes_anterior"             "prom_sem"                     
##  [7] "max_bim"                       "prom_trim"                    
##  [9] "pc_cant_moras_30_ult_12_meses" "desv_trim"                    
## [11] "desv_bim"                      "dh_cant_entradas"             
## [13] "pc_transaccional"              "dh_val_entradas"              
## [15] "pcons_tarjeta_de_credito"      "dh_avg_dia_retiros_d"         
## [17] "dmi_max_egreso_diario"         "dh_cant_otros_d"              
## [19] "dmi_max_ingreso_diario"        "dh_val_otros_d"               
## [21] "pc_ingreso_final"              "dh_cant_pagos_d"              
## [23] "dmi_ingreso_total_mes"         "dh_val_pagos_d"               
## [25] "pc_gasto_familiar"             "pc_cuotas_pagadas"            
## [27] "pc_ingreso_rutina_con_techo"   "dh_cant_salidas"              
## [29] "dh_min_dia_pagos_d"            "pc_ingreso_por_rutina"        
## [31] "dmi_egreso_total_mes"          "dh_val_salidas"               
## [33] "cpc_avg_nro_cuota"             "cpc_sum_saldo"                
## [35] "cpc_saldo_sobre_ing"           "dh_min_dia_pago_cred_d"       
## [37] "cpc_saldo_tdc"                 "dh_cant_tras_d"               
## [39] "cpc_avg_saldo"                 "dh_val_pago_tarj_d"           
## [41] "pcons_hipotecario_vivienda"    "flag_mora60_ult12meses"       
## [43] "flag_ultima_entrada_1quincena" "flag_tuvo_mora60_ult3meses"   
## [45] "flag_diaPago_1quincena"        "flag_es_cluster_6"            
## [47] "flag_tuvo_mora90_ult12M"       "flax_maxDiaOtrosD_1quincena"  
## [49] "flag_maxDiaPagos_d_1quincena"  "flag_encontrado_cifin"
# Row count of the predictors and length of the key vector, printed side by side.
nrow(unificado)
## [1] 902
length(KEY)
## [1] 902
# Keep only the variables that are needed downstream.
unificado <- local({
  requeridas <- c(
    'desv_sem',
    'max_mes_anterior',
    'desv_trim',
    'dh_cant_entradas',
    'pcons_tarjeta_de_credito',
    'dh_avg_dia_retiros_d',
    'dh_cant_otros_d',
    'dmi_max_ingreso_diario',
    'dh_val_otros_d',
    'dh_cant_pagos_d',
    'dh_val_pagos_d',
    'pc_cuotas_pagadas',
    'dh_min_dia_pagos_d',
    'pc_ingreso_por_rutina',
    'cpc_saldo_sobre_ing',
    'dh_min_dia_pago_cred_d',
    'cpc_saldo_tdc',
    'dh_cant_tras_d',
    'cpc_avg_saldo',
    'dh_val_pago_tarj_d',
    'flag_tuvo_mora60_ult3meses',
    'flag_diaPago_1quincena',
    'flag_es_cluster_6',
    'flag_tuvo_mora90_ult12M',
    'flax_maxDiaOtrosD_1quincena',
    'flag_encontrado_cifin'
  )
  # Fail loudly when an expected variable is absent: a bare `%in%` filter
  # would silently return fewer columns in that case.
  faltantes <- setdiff(requeridas, names(unificado))
  if (length(faltantes) > 0) {
    stop(
      "Variables faltantes en 'unificado': ",
      paste(faltantes, collapse = ", "),
      call. = FALSE
    )
  }
  # The logical mask keeps the columns in their existing order; drop = FALSE
  # guarantees a data frame even if only one column were selected.
  unificado[, names(unificado) %in% requeridas, drop = FALSE]
})
dim(unificado)
## [1] 902  26
names(unificado)
##  [1] "desv_sem"                    "max_mes_anterior"           
##  [3] "desv_trim"                   "dh_cant_entradas"           
##  [5] "pcons_tarjeta_de_credito"    "dh_avg_dia_retiros_d"       
##  [7] "dh_cant_otros_d"             "dmi_max_ingreso_diario"     
##  [9] "dh_val_otros_d"              "dh_cant_pagos_d"            
## [11] "dh_val_pagos_d"              "pc_cuotas_pagadas"          
## [13] "dh_min_dia_pagos_d"          "pc_ingreso_por_rutina"      
## [15] "cpc_saldo_sobre_ing"         "dh_min_dia_pago_cred_d"     
## [17] "cpc_saldo_tdc"               "dh_cant_tras_d"             
## [19] "cpc_avg_saldo"               "dh_val_pago_tarj_d"         
## [21] "flag_tuvo_mora60_ult3meses"  "flag_diaPago_1quincena"     
## [23] "flag_es_cluster_6"           "flag_tuvo_mora90_ult12M"    
## [25] "flax_maxDiaOtrosD_1quincena" "flag_encontrado_cifin"
# Keep only the independent variables (X is built further below);
# first show the full vector of record keys.
print(KEY)
##   [1]    1    3    4    6    7    8   10   11   13   14   16   18   19   23   24
##  [16]   25   28   30   32   33   39   40   42   43   46   48   50   51   55   56
##  [31]   57   58   59   60   61   63   65   66   67   70   75   78   87   88   89
##  [46]   93   94   95   96   97   99  100  101  103  104  105  106  107  108  109
##  [61]  111  114  116  119  120  121  122  124  125  126  127  129  130  131  132
##  [76]  133  134  136  138  141  142  143  144  145  147  148  149  152  153  154
##  [91]  155  157  160  163  164  165  169  170  171  177  182  184  185  187  189
## [106]  190  192  195  206  210  211  215  218  219  224  225  226  227  229  231
## [121]  234  235  237  238  241  242  243  245  246  249  250  251  252  254  259
## [136]  260  261  262  265  266  267  268  271  278  280  282  291  293  295  299
## [151]  302  304  305  306  312  313  314  319  321  322  324  326  329  331  334
## [166]  338  342  344  345  348  350  357  359  362  364  365  366  369  371  372
## [181]  373  374  378  383  384  385  386  387  390  391  395  398  400  404  407
## [196]  411  412  413  414  415  420  422  423  424  425  428  433  435  436  437
## [211]  438  440  444  450  452  456  461  462  465  467  468  476  479  482  483
## [226]  484  485  487  488  489  490  492  493  495  497  498  500  504  505  515
## [241]  516  517  530  533  534  537  539  540  541  543  546  547  549  551  552
## [256]  553  554  556  557  558  559  562  563  564  568  572  574  576  577  578
## [271]  581  583  585  586  590  592  593  601  606  607  609  613  614  615  616
## [286]  619  620  621  622  628  629  632  633  635  636  638  640  642  643  645
## [301]  649  650  653  654  657  661  662  664  668  669  671  673  675  676  677
## [316]  680  681  683  685  687  688  689  690  691  694  695  696  704  708  709
## [331]  715  729  730  732  733  734  738  739  746  749  750  757  761  764  765
## [346]  775  781  784  785  787  788  790  792  794  795  798  813  815  816  817
## [361]  818  819  820  821  822  824  826  827  830  831  836  837  838  840  841
## [376]  843  844  849  852  853  854  855  858  865  873  876  885  886  887  893
## [391]  894  901  902  906  907  912  913  914  915  920  921  924  925  926  927
## [406]  931  934  939  941  943  944  949  950  951  954  956  961  962  966  967
## [421]  968  970  973  974  975  977  980  984  986  988  991  993  994  995  998
## [436] 1000 1008 1009 1011 1012 1013 1014 1020 1021 1022 1024 1028 1030 1031 1033
## [451] 1035 1036 1044 1049 1052 1059 1061 1067 1068 1071 1073 1075 1076 1077 1080
## [466] 1082 1083 1085 1086 1088 1092 1093 1095 1096 1098 1099 1107 1108 1110 1111
## [481] 1112 1118 1120 1125 1127 1129 1132 1133 1134 1135 1137 1139 1140 1142 1144
## [496] 1145 1147 1150 1151 1154 1155 1156 1157 1158 1159 1168 1172 1176 1177 1179
## [511] 1182 1185 1186 1193 1195 1196 1199 1205 1206 1207 1210 1213 1215 1216 1218
## [526] 1219 1221 1223 1226 1227 1229 1235 1237 1240 1242 1243 1246 1247 1249 1250
## [541] 1254 1255 1256 1257 1261 1265 1266 1268 1269 1271 1272 1274 1275 1276 1281
## [556] 1292 1304 1313 1314 1315 1316 1317 1320 1321 1322 1323 1324 1328 1330 1331
## [571] 1332 1333 1334 1335 1339 1340 1342 1345 1347 1353 1355 1357 1361 1362 1369
## [586] 1371 1384 1389 1390 1391 1392 1393 1395 1398 1401 1406 1407 1408 1411 1412
## [601] 1416 1420 1424 1426 1430 1435 1436 1437 1438 1439 1440 1442 1443 1444 1447
## [616] 1449 1452 1454 1455 1457 1459 1462 1464 1465 1466 1469 1471 1474 1476 1477
## [631] 1479 1481 1484 1486 1488 1491 1493 1500 1501 1505 1508 1509 1511 1513 1514
## [646] 1519 1522 1523 1524 1527 1528 1529 1530 1533 1535 1542 1543 1544 1545 1548
## [661] 1550 1551 1554 1555 1556 1557 1559 1561 1563 1564 1567 1569 1572 1573 1574
## [676] 1576 1577 1578 1579 1580 1582 1583 1584 1587 1588 1589 1591 1592 1594 1597
## [691] 1599 1601 1602 1605 1606 1607 1609 1610 1611 1613 1615 1618 1619 1620 1622
## [706] 1625 1628 1630 1634 1635 1637 1638 1641 1643 1645 1646 1648 1649 1650 1656
## [721] 1660 1663 1664 1667 1669 1671 1673 1675 1679 1680 1683 1684 1687 1689 1690
## [736] 1693 1696 1699 1700 1704 1707 1708 1710 1711 1715 1716 1717 1718 1721 1722
## [751] 1725 1728 1729 1730 1731 1732 1733 1734 1735 1737 1738 1742 1744 1747 1749
## [766] 1753 1756 1758 1759 1761 1762 1765 1768 1769 1770 1771 1772 1777 1779 1780
## [781] 1781 1783 1784 1786 1787 1789 1792 1794 1795 1797 1802 1803 1806 1808 1809
## [796] 1810 1811 1812 1813 1820 1821 1822 1824 1825 1829 1831 1833 1834 1837 1840
## [811] 1847 1850 1852 1854 1864 1865 1866 1867 1871 1872 1873 1880 1883 1884 1885
## [826] 1889 1895 1896 1898 1899 1904 1905 1906 1907 1908 1909 1910 1911 1913 1916
## [841] 1918 1921 1922 1924 1925 1928 1929 1933 1935 1936 1938 1939 1941 1942 1943
## [856] 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1959 1964
## [871] 1967 1968 1969 1970 1974 1976 1977 1978 1984 1987 1988 1989 1990 1991 1992
## [886] 1993 1994 1997 1998 2000 2005 2007 2010 2012 2013 2014 2017 2019 2021 2025
## [901] 2027 2028
length(KEY)
## [1] 902
# Apply scaling: transform the selected predictors with `preprocesado`
# (defined outside this section) and store the result as X.
X <- predict(preprocesado, unificado)
head(X)
##      desv_sem max_mes_anterior  desv_trim dh_cant_entradas
## 1 -0.25529797      -0.50752897 -1.1941103      -1.06028059
## 2  0.04710176       0.65178349  0.7213348      -1.06028059
## 3  0.01993882      -0.03016501  0.3804477      -1.06028059
## 4  0.65106628       1.94748564  0.8481227       0.07513835
## 5  0.47165449       0.71997834  0.7897396      -0.45337364
## 6  0.73660743      -0.50752897  0.8408677      -0.20198454
##   pcons_tarjeta_de_credito dh_avg_dia_retiros_d dh_cant_otros_d
## 1               -0.6752024           -1.2211367      -0.8645237
## 2               -0.6752024           -1.2211367      -0.8645237
## 3               -0.6752024           -1.2211367      -0.8645237
## 4               -0.6752024            0.5306734       0.8224426
## 5               -0.6752024            0.7530771      -0.8645237
## 6               -0.6752024            0.3631089      -0.1100894
##   dmi_max_ingreso_diario dh_val_otros_d dh_cant_pagos_d dh_val_pagos_d
## 1             -0.5939123     -0.4234230     -0.77779963     -0.5468244
## 2             -0.5939123     -0.4234230     -0.77779963     -0.5468244
## 3             -0.5939123     -0.4234230     -0.77779963     -0.5468244
## 4             -0.3281884     -0.3711720     -0.45552275      0.5283690
## 5             -0.5179912     -0.4234230     -0.13324587     -0.1850389
## 6             -0.2744742     -0.4150344      0.02789256      0.2742168
##   pc_cuotas_pagadas dh_min_dia_pagos_d pc_ingreso_por_rutina
## 1        -0.6194104         -0.6551503            -0.6333276
## 2        -0.6194104         -0.6551503            -0.6333276
## 3        -0.6194104         -0.6551503            -0.3666269
## 4        -0.6194104          0.5026302            -0.6248345
## 5        -0.6194104         -0.3243559            -0.6248345
## 6        -0.6194104         -0.4897531            -0.4254765
##   cpc_saldo_sobre_ing dh_min_dia_pago_cred_d cpc_saldo_tdc dh_cant_tras_d
## 1          -0.3699031             -0.6821349    -0.6288475     -0.4777662
## 2          -0.3699031             -0.6821349    -0.6288475     -0.4777662
## 3          -0.3699031             -0.6821349    -0.6288475     -0.4777662
## 4          -0.3699031              0.2916044    -0.6288475     -0.4777662
## 5          -0.3699031             -0.4039237    -0.6288475     -0.4777662
## 6          -0.3699031             -0.5430293    -0.6288475     -0.4777662
##   cpc_avg_saldo dh_val_pago_tarj_d flag_tuvo_mora60_ult3meses
## 1    -0.4360925         -0.3932686                 -0.3840753
## 2    -0.4360925         -0.3932686                 -0.3840753
## 3    -0.4360925         -0.3932686                 -0.3840753
## 4    -0.4360925         -0.3932686                 -0.3840753
## 5    -0.4360925         -0.3932686                 -0.3840753
## 6    -0.4360925         -0.3932686                 -0.3840753
##   flag_diaPago_1quincena flag_es_cluster_6 flag_tuvo_mora90_ult12M
## 1              0.6415816         -1.008661              -0.2625406
## 2              0.6415816         -1.008661              -0.2625406
## 3              0.6415816         -1.008661              -0.2625406
## 4              0.6415816         -1.008661              -0.2625406
## 5              0.6415816         -1.008661              -0.2625406
## 6              0.6415816         -1.008661              -0.2625406
##   flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
## 1                    0.881491            -1.4521007
## 2                    0.881491            -1.4521007
## 3                    0.881491             0.6886098
## 4                   -1.134363             0.6886098
## 5                    0.881491             0.6886098
## 6                    0.881491             0.6886098
# Prepend the record key to the scaled predictors. check.names = FALSE keeps
# the column names exactly as they are, matching cbind() on a data frame.
OOT <- data.frame(KEY, X, check.names = FALSE)
head(OOT)
##   KEY    desv_sem max_mes_anterior  desv_trim dh_cant_entradas
## 1   1 -0.25529797      -0.50752897 -1.1941103      -1.06028059
## 2   3  0.04710176       0.65178349  0.7213348      -1.06028059
## 3   4  0.01993882      -0.03016501  0.3804477      -1.06028059
## 4   6  0.65106628       1.94748564  0.8481227       0.07513835
## 5   7  0.47165449       0.71997834  0.7897396      -0.45337364
## 6   8  0.73660743      -0.50752897  0.8408677      -0.20198454
##   pcons_tarjeta_de_credito dh_avg_dia_retiros_d dh_cant_otros_d
## 1               -0.6752024           -1.2211367      -0.8645237
## 2               -0.6752024           -1.2211367      -0.8645237
## 3               -0.6752024           -1.2211367      -0.8645237
## 4               -0.6752024            0.5306734       0.8224426
## 5               -0.6752024            0.7530771      -0.8645237
## 6               -0.6752024            0.3631089      -0.1100894
##   dmi_max_ingreso_diario dh_val_otros_d dh_cant_pagos_d dh_val_pagos_d
## 1             -0.5939123     -0.4234230     -0.77779963     -0.5468244
## 2             -0.5939123     -0.4234230     -0.77779963     -0.5468244
## 3             -0.5939123     -0.4234230     -0.77779963     -0.5468244
## 4             -0.3281884     -0.3711720     -0.45552275      0.5283690
## 5             -0.5179912     -0.4234230     -0.13324587     -0.1850389
## 6             -0.2744742     -0.4150344      0.02789256      0.2742168
##   pc_cuotas_pagadas dh_min_dia_pagos_d pc_ingreso_por_rutina
## 1        -0.6194104         -0.6551503            -0.6333276
## 2        -0.6194104         -0.6551503            -0.6333276
## 3        -0.6194104         -0.6551503            -0.3666269
## 4        -0.6194104          0.5026302            -0.6248345
## 5        -0.6194104         -0.3243559            -0.6248345
## 6        -0.6194104         -0.4897531            -0.4254765
##   cpc_saldo_sobre_ing dh_min_dia_pago_cred_d cpc_saldo_tdc dh_cant_tras_d
## 1          -0.3699031             -0.6821349    -0.6288475     -0.4777662
## 2          -0.3699031             -0.6821349    -0.6288475     -0.4777662
## 3          -0.3699031             -0.6821349    -0.6288475     -0.4777662
## 4          -0.3699031              0.2916044    -0.6288475     -0.4777662
## 5          -0.3699031             -0.4039237    -0.6288475     -0.4777662
## 6          -0.3699031             -0.5430293    -0.6288475     -0.4777662
##   cpc_avg_saldo dh_val_pago_tarj_d flag_tuvo_mora60_ult3meses
## 1    -0.4360925         -0.3932686                 -0.3840753
## 2    -0.4360925         -0.3932686                 -0.3840753
## 3    -0.4360925         -0.3932686                 -0.3840753
## 4    -0.4360925         -0.3932686                 -0.3840753
## 5    -0.4360925         -0.3932686                 -0.3840753
## 6    -0.4360925         -0.3932686                 -0.3840753
##   flag_diaPago_1quincena flag_es_cluster_6 flag_tuvo_mora90_ult12M
## 1              0.6415816         -1.008661              -0.2625406
## 2              0.6415816         -1.008661              -0.2625406
## 3              0.6415816         -1.008661              -0.2625406
## 4              0.6415816         -1.008661              -0.2625406
## 5              0.6415816         -1.008661              -0.2625406
## 6              0.6415816         -1.008661              -0.2625406
##   flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
## 1                    0.881491            -1.4521007
## 2                    0.881491            -1.4521007
## 3                    0.881491             0.6886098
## 4                   -1.134363             0.6886098
## 5                    0.881491             0.6886098
## 6                    0.881491             0.6886098
# Predicted probabilities for the OOT dataset
KEY <- OOT[["KEY"]]
# The second column of the probability output is used as the score.
# NOTE(review): presumably the positive class — confirm against RF2's class order.
probabilidad <- as.numeric(predict(RF2, OOT, type = "prob")[, 2])
hist(probabilidad)

# Binary answer: 1 when the probability reaches 0.50, otherwise 0.
respuesta <- as.numeric(probabilidad >= 0.50)
table(respuesta)
## respuesta
##   0   1 
## 379 523
# Output table: record key, predicted probability and binary answer.
llave <- KEY
# Build the table with data.frame() rather than as.data.frame(cbind(...)):
# cbind() on vectors goes through a matrix, which forces all three columns
# into one common type (a non-numeric key would turn the probabilities into
# text). data.frame() keeps each column's own type.
imprimir <- data.frame(llave, probabilidad, respuesta)
head(imprimir)
##   llave probabilidad respuesta
## 1     1         0.42         0
## 2     3         0.46         0
## 3     4         0.26         0
## 4     6         0.25         0
## 5     7         0.31         0
## 6     8         0.40         0
table(imprimir$respuesta)
## 
##   0   1 
## 379 523
# Export left disabled:
# write.csv(imprimir,"Base_prueba_evaluado.csv")