library(caret) # para preprocesado (scaling), nearzerovar y findcorrelation
## Loading required package: lattice
## Loading required package: ggplot2
library(DMwR) # para el balanceo SMOTE
## Warning: package 'DMwR' was built under R version 4.0.2
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(car) # para prueba de inflación de la varianza en la regresión
## Loading required package: carData
library(pROC) # para calcular el AUC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROCR) # para calcular el AUC
library(glmnet) # para regresión elastic net
## Warning: package 'glmnet' was built under R version 4.0.2
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(randomForest) # para random forest
## Warning: package 'randomForest' was built under R version 4.0.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(adabag) # para adaboost
## Loading required package: rpart
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
# Set the working directory and load the training set.
# NOTE(review): the absolute, user-specific path below makes the script
# non-portable; consider a project-relative path — confirm nothing later in the
# script depends on the working directory before changing it.
setwd("C:/Users/licja/Downloads/Prueba BAM Javier Saravia")
getwd()
## [1] "C:/Users/licja/Downloads/Prueba BAM Javier Saravia"
original <- read.csv("Base_entrenamiento.csv", header = TRUE)
dim(original) # 20,000 rows and 124 columns loaded
## [1] 20000 124
data <- original # feature engineering is done on this copy; `original` stays untouched
Los valores atípicos de cada variable se identificaron visualmente mediante histogramas y luego se cuantificaron mediante un análisis de percentiles.
# Outlier removal: keep only rows whose value is at or below the upper cap
# chosen for each variable. Rows where any capped variable is NA are dropped,
# exactly as the successive subset() calls did.
# pc_transaccional was previously filtered twice (136903394, then 92269310);
# only the stricter cap is kept because it fully determines the result.
outlier_caps <- c(
  pc_transaccional = 92269310.0,
  dmi_max_egreso_diario = 85362328,
  dmi_max_ingreso_diario = 104909227.3,
  dh_val_otros_d = 9998335.80,
  dmi_ingreso_total_mes = 205163705.0,
  dh_val_pagos_d = 50357360.00,
  pc_gasto_familiar = 26582180.19,
  pc_cuotas_pagadas = 8156850.0,
  dmi_egreso_total_mes = 209829245.0,
  dh_val_salidas = 217011100,
  cpc_sum_saldo = 220905770.00,
  cpc_saldo_sobre_ing = 20.307294513,
  cpc_saldo_tdc = 43186883.96,
  dh_cant_tras_d = 31,
  cpc_avg_saldo = 106884393.50,
  dh_val_pago_tarj_d = 10560315.0
)

# Fail loudly on a missing/misspelled column: with `data$col` a missing column
# yields NULL and subset() would silently return zero rows.
missing_caps <- setdiff(names(outlier_caps), names(data))
if (length(missing_caps) > 0) {
  stop(
    "Outlier-capped columns not found in data: ",
    paste(missing_caps, collapse = ", "),
    call. = FALSE
  )
}

# Build one combined row mask instead of copying the data frame once per filter.
within_caps <- rep(TRUE, nrow(data))
for (cap_col in names(outlier_caps)) {
  within_caps <- within_caps & data[[cap_col]] <= outlier_caps[[cap_col]]
}

# which() discards NA entries of the mask, matching subset()'s NA handling;
# original row names are preserved.
data <- data[which(within_caps), , drop = FALSE]
Cada variable se exploró individualmente para identificar si requiere transformaciones y/o si se puede utilizar
Muchas variables fueron descartadas por tener solo 0’s, ser nulas, o tener una varianza cercana a 0
Cuando la variable se considera apta para uso, se guarda en una variable independiente; posteriormente se unificarán todas esas variables
Si bien este es un ejercicio, pueden hacerse las siguientes observaciones y sugerencias:
Muchas variables contienen solo 0’s; muchas otras tienen casi toda la frecuencia concentrada en una sola clase, lo que hace que la varianza sea cercana a 0.
Muchas variables son repetitivas acerca del momento del mes en que el cliente recibió algún depósito, hizo algún pago, retiro, etc. Creo que medir la variable así no solo causará multicolinealidad con las demás variables parecidas, sino que además tendrá un componente estacional, volviéndose así autorregresiva. Creo que es mejor hacer una variable binaria que indique si la transacción fue en la primera o en la segunda quincena del mes; con ello, además, estaríamos incrementando la varianza.
Sugiero variables como:
Faltan todas las variables sociodemográficas del cliente
Interacción del cliente con el banco: cantidad de productos, suma del saldo de productos, suma del monto de productos, antigüedad o tiempo de ser cliente, etc.
Las transacciones en cuentas de pasivos se pueden categorizar, así tendríamos variables como: suma_educacion, suma_combustible, suma_restaurantes, suma_telefonia, etc.
Considero que de los burós podemos sacar mucha más información: cantidad, saldo y monto de tarjetas de crédito, créditos fiduciarios y otros créditos, inclusive comportamiento en créditos comerciales y pago de servicios
A nivel de banco y nivel de buró elaboraría muchas más variables referentes a mora: max_mora_Nmeses, cantidad_moraN_Nmeses, desv_maxMora_Nmeses, etc.
Podemos agregar también variables sobre los canales, la frecuencia y el modo en que retira su dinero: ATM, transferencias, tarjeta de débito, etc.
# Per-variable exploration (columns 1-10).
# Standalone variables are created here, already transformed where needed;
# columns judged unfit for the model are skipped.
# Later on, a single dataset will be assembled from all of them.
# Numeric variables are created for use in the linear models (logistic regression and elastic net).
# Which variables to keep will be decided by checking that their variance is not close to 0.
# In addition, independent variables that are highly correlated with each other will be removed.
# Many variables have a high frequency near 0 (positive skew), so they will most likely be discarded.
names(data)[1] # max_trim: maximum days past due in the previous quarter
## [1] "max_trim"
hist(data$max_trim)
hist(sqrt(data$max_trim)) # variable normalised with sqrt
max_trim <- sqrt(data$max_trim)
names(data)[2] # max_sem: maximum days past due in the previous semester
## [1] "max_sem"
hist(data$max_sem)
hist(sqrt(data$max_sem))
max_sem <- data$max_sem # kept untransformed (sqrt histogram inspected but not used)
names(data)[3] # desv_sem: standard deviation of the maximum days past due in the previous semester
## [1] "desv_sem"
hist(data$desv_sem)
hist(sqrt(data$desv_sem))
desv_sem <- data$desv_sem # kept untransformed
names(data)[4] # prom_bim: mean of the maximum days past due in the previous two-month period
## [1] "prom_bim"
hist(data$prom_bim)
hist(sqrt(data$prom_bim))
prom_bim <- sqrt(data$prom_bim)
names(data)[5] # max_mes_anterior: maximum days past due in the previous month
## [1] "max_mes_anterior"
hist(data$max_mes_anterior)
hist(sqrt(data$max_mes_anterior))
max_mes_anterior <- data$max_mes_anterior # kept untransformed
names(data)[6] # prom_mes_anterior: mean days past due in the previous month
## [1] "prom_mes_anterior"
hist(data$prom_mes_anterior)
hist(sqrt(data$prom_mes_anterior))
prom_mes_anterior <- sqrt(data$prom_mes_anterior)
names(data)[7] # prom_sem: mean of the maximum days past due in the previous semester
## [1] "prom_sem"
hist(data$prom_sem)
hist(sqrt(data$prom_sem))
prom_sem <- data$prom_sem # kept untransformed
names(data)[8] # max_bim: maximum days past due in the previous two-month period
## [1] "max_bim"
hist(data$max_bim)
hist(sqrt(data$max_bim))
max_bim <- data$max_bim # kept untransformed
names(data)[9] # mejor_gestion: best collection action performed
## [1] "mejor_gestion"
# Only tabulated; no standalone variable is created for it here.
table(data$mejor_gestion)
##
## 0 5 6 7 16
## 18031 1 3 1 10
names(data)[10] # prom_trim: mean of the maximum days past due in the previous quarter
## [1] "prom_trim"
hist(data$prom_trim)
hist(sqrt(data$prom_trim))
prom_trim <- sqrt(data$prom_trim)
# Per-variable exploration (columns 11-17).
names(data)[11] # pc_cant_moras_30_ult_12_meses: number of 30-day delinquencies in the last 12 months <= 5 or empty.
## [1] "pc_cant_moras_30_ult_12_meses"
summary(data$pc_cant_moras_30_ult_12_meses)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.298 1.000 12.000
table(data$pc_cant_moras_30_ult_12_meses)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 11672 2024 1146 919 500 333 321 188 182 192 196 176 197
pc_cant_moras_30_ult_12_meses <- data$pc_cant_moras_30_ult_12_meses # kept as the raw count
names(data)[12] # desv_trim: standard deviation of the maximum days past due in the previous quarter
## [1] "desv_trim"
hist(data$desv_trim)
hist(sqrt(data$desv_trim))
desv_trim <- sqrt(data$desv_trim)
names(data)[13] # nro_gestiones: number of collection actions performed
## [1] "nro_gestiones"
# Only tabulated; no standalone variable is created for it here.
table(data$nro_gestiones)
##
## 0 1 2 3 5
## 18031 6 4 4 1
names(data)[14] # desv_bim: standard deviation of the maximum days past due in the previous two-month period
## [1] "desv_bim"
hist(data$desv_bim)
hist(sqrt(data$desv_bim))
desv_bim <- sqrt(data$desv_bim)
names(data)[15] # pc_cant_moras_30_ult_3_meses: number of 30-day delinquencies in the last 3 months = 0 or empty.
## [1] "pc_cant_moras_30_ult_3_meses"
# Only inspected; no standalone variable is created for it here.
summary(data$pc_cant_moras_30_ult_3_meses)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4745 0.0000 3.0000
hist(data$pc_cant_moras_30_ult_3_meses)
names(data)[16] # dh_cant_entradas: number of incoming-money transactions in the previous month
## [1] "dh_cant_entradas"
summary(data$dh_cant_entradas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 5.00 11.52 17.00 139.00
hist(data$dh_cant_entradas)
hist(sqrt(data$dh_cant_entradas))
dh_cant_entradas <- sqrt(data$dh_cant_entradas)
names(data)[17] # pc_tiem_1er_prod_abierto_total: total time with an open product
## [1] "pc_tiem_1er_prod_abierto_total"
table(data$pc_tiem_1er_prod_abierto_total)
##
## 0 2 3 5 11 14 16 17 18 19 21 22 24
## 17794 6 2 5 3 1 5 1 2 1 4 1 1
## 25 26 27 28 29 30 31 33 34 35 38 39 41
## 2 5 2 5 4 2 5 2 4 1 2 35 2
## 42 44 46 48 49 50 51 52 53 54 55 56 57
## 1 3 3 2 21 4 2 2 1 5 4 2 2
## 58 59 60 61 62 63 65 66 67 68 69 70 71
## 9 3 1 1 3 2 2 4 2 8 3 3 2
## 74 77 79 81 82 88 94 97 104 105 107 108 110
## 1 1 2 1 4 1 2 3 2 1 1 3 1
## 111 115 117 121 124 126 128 129 138 141 143 144 146
## 1 1 1 1 2 3 2 1 6 3 1 1 2
## 169 171 176 179 299
## 1 1 2 1 1
# Per-variable exploration (end of column 17, columns 18-20).
hist(data$pc_tiem_1er_prod_abierto_total)
names(data)[18] # pc_cant_moras_60_ult_12_meses: number of 60-day delinquencies in the last 12 months <= 1 or empty.
## [1] "pc_cant_moras_60_ult_12_meses"
table(data$pc_cant_moras_60_ult_12_meses)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 14905 1052 528 353 230 130 147 175 128 132 105 87 74
hist(data$pc_cant_moras_60_ult_12_meses) # it will be turned into a binary variable
# 1 if the count is at least 1, 0 otherwise (NA stays NA).
x <- ifelse(data$pc_cant_moras_60_ult_12_meses >= 1, 1, 0)
table(x)
## x
## 0 1
## 14905 3141
flag_mora60_ult12meses <- x
names(data)[19] # gestiones_eficaces: number of effective collection actions
## [1] "gestiones_eficaces"
# Only tabulated; no standalone variable is created for it here.
table(data$gestiones_eficaces)
##
## 0 1 2 3
## 18031 11 3 1
names(data)[20] # pc_transaccional: income according to the customer's transactional estimator
## [1] "pc_transaccional"
summary(data$pc_transaccional)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2043200 7547230 14096941 19069978 92269310
boxplot(data$pc_transaccional) # outliers on the upper side of the variable
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$pc_transaccional, seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 7% 8% 9% 10% 11% 12% 13%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 14% 15% 16% 17% 18% 19% 20%
## 0.0 0.0 57600.0 248650.0 410806.8 665000.0 910000.0
## 21% 22% 23% 24% 25% 26% 27%
## 1089945.0 1381618.2 1616750.0 1840077.7 2043200.0 2266212.4 2500000.0
## 28% 29% 30% 31% 32% 33% 34%
## 2667730.5 2880776.6 3000000.0 3192751.5 3356433.2 3512153.2 3723150.6
## 35% 36% 37% 38% 39% 40% 41%
## 3963014.5 4201883.2 4410000.0 4560285.8 4812979.0 5058932.8 5239323.0
## 42% 43% 44% 45% 46% 47% 48%
## 5555416.6 5822665.3 6149353.8 6386744.2 6600000.0 6810155.7 7000000.0
## 49% 50% 51% 52% 53% 54% 55%
## 7276333.0 7547229.5 7944952.3 8239311.7 8461116.5 8765504.8 9050000.0
## 56% 57% 58% 59% 60% 61% 62%
## 9400018.2 9727957.2 10067467.5 10400000.0 10780000.0 11350000.0 11863116.3
## 63% 64% 65% 66% 67% 68% 69%
## 12154410.1 12560835.0 12988541.8 13632351.8 14295950.0 14702743.6 15137238.4
## 70% 71% 72% 73% 74% 75% 76%
## 15629789.5 16275052.0 16971838.3 17545912.3 18452323.0 19069977.5 19750000.0
## 77% 78% 79% 80% 81% 82% 83%
## 20328585.3 21272355.0 22094848.0 23043772.4 24400000.0 25657795.3 27167290.0
## 84% 85% 86% 87% 88% 89% 90%
## 28450000.0 30005655.5 31122150.0 32515000.0 34360117.8 36540016.0 38620000.0
## 91% 92% 93% 94% 95% 96% 97%
## 41672027.0 43349004.2 45725673.9 49521221.0 52488819.3 57918192.0 63254555.0
## 98% 99% 100%
## 70506644.0 79003883.0 92269310.0
# Per-variable exploration (end of column 20, columns 21-25).
# Histogram of pc_transaccional restricted to values at or below the cap.
x <- subset(data$pc_transaccional, data$pc_transaccional <= 92269310.0)
hist(x)
pc_transaccional <- data$pc_transaccional # kept untransformed
names(data)[21] # dh_max_dia_entradas: last day on which an incoming-money transaction was received
## [1] "dh_max_dia_entradas"
table(data$dh_max_dia_entradas)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 6166 10 31 11 17 27 29 27 46 51 53 46 83 65 97 71
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 92 79 69 74 105 155 168 201 214 183 196 331 1360 449 2911 4629
hist(data$dh_max_dia_entradas) # it will be turned into a binary variable
# 1 if the last inflow day is <= 15 (first half of the month), 0 otherwise.
# NOTE(review): the value 0 (6166 rows) also satisfies `<= 15`, so it accounts
# for most of the 6830 ones below (days 1-15 sum to only 664). Presumably 0
# means "no inflow recorded" rather than a calendar day — confirm whether it
# should be flagged separately.
x <- ifelse(data$dh_max_dia_entradas <= 15, 1, 0)
table(x)
## x
## 0 1
## 11216 6830
flag_ultima_entrada_1quincena <- x
names(data)[22] # pc_cupo_entidad: credit-card limit at the bank
## [1] "pc_cupo_entidad"
# Only inspected; no standalone variable is created for it here.
summary(data$pc_cupo_entidad)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 137908 0 36000000
hist(data$pc_cupo_entidad)
table(ifelse(data$pc_cupo_entidad >= 1, 1, 0))
##
## 0 1
## 17834 212
names(data)[23] # pc_cuotas_como_ppal: instalments paid as principal
## [1] "pc_cuotas_como_ppal"
# Only inspected; no standalone variable is created for it here.
summary(data$pc_cuotas_como_ppal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 49160 0 8156850
hist(data$pc_cuotas_como_ppal)
names(data)[24] # dh_val_entradas: total value of the inflows received in the previous month
## [1] "dh_val_entradas"
summary(data$dh_val_entradas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 5088944 18769832 26188088 204764342
hist(data$dh_val_entradas)
hist(sqrt(data$dh_val_entradas))
dh_val_entradas <- sqrt(data$dh_val_entradas)
names(data)[25] # pc_cant_moras_90_ult_12_meses: number of 90-day-or-worse delinquencies in the last 12 months = 0 or empty.
## [1] "pc_cant_moras_90_ult_12_meses"
# Only inspected; no standalone variable is created for it here.
table(data$pc_cant_moras_90_ult_12_meses)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 16250 454 310 163 149 151 106 108 111 92 69 33 50
hist(data$pc_cant_moras_90_ult_12_meses)
hist(sqrt(data$pc_cant_moras_90_ult_12_meses))
# Per-variable exploration (columns 26-31).
names(data)[26] # dh_max_dia_salidas
## [1] "dh_max_dia_salidas"
# I prefer to keep the same variable but for the current month
names(data)[27] # pc_cant_moras_60_ult_3_meses: number of 60-day delinquencies in the last 3 months = 0 or empty.
## [1] "pc_cant_moras_60_ult_3_meses"
table(data$pc_cant_moras_60_ult_3_meses)
##
## 0 1 2 3
## 15713 833 443 1057
table(ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0))
##
## 0 1
## 15713 2333
# Binary flag: 1 if there was at least one 60-day delinquency, 0 otherwise.
flag_tuvo_mora60_ult3meses <- ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0)
names(data)[28] # pc_cuota_tarjeta_de_credito: credit-card instalment reported by CIFIN
## [1] "pc_cuota_tarjeta_de_credito"
# Only inspected; no standalone variable is created for it here.
summary(data$pc_cuota_tarjeta_de_credito)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 14910 0 5445000
hist(data$pc_cuota_tarjeta_de_credito)
table(ifelse(data$pc_cuota_tarjeta_de_credito >= 1, 1, 0))
##
## 0 1
## 17803 243
names(data)[29] # cp_inicial_menos_saldo: initial value minus the balance in the previous month, per product
## [1] "cp_inicial_menos_saldo"
# All summary statistics are 0; no standalone variable is created for it here.
summary(data$cp_inicial_menos_saldo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
names(data)[30] # pc_peor_estado_act_cta_aho: worst savings-account status
## [1] "pc_peor_estado_act_cta_aho"
# Only inspected; no standalone variable is created for it here.
summary(data$pc_peor_estado_act_cta_aho)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04034 0.00000 3.00000
hist(data$pc_peor_estado_act_cta_aho)
names(data)[31] # dia_pago: day of the month on which the obligation is paid
## [1] "dia_pago"
table(data$dia_pago)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 599 314 4072 227 238 226 258 203 210 208 278 233 277 215 5161 245
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 1856 221 270 200 220 283 235 206 232 250 267 274 188 333 47
# Binary flag: 1 if the payment day falls on day 15 or earlier, 0 otherwise.
x <- ifelse(data$dia_pago <= 15, 1, 0)
table(x)
## x
## 0 1
## 5327 12719
flag_diaPago_1quincena <- x
# Per-variable exploration (columns 32-39).
names(data)[32] # cp_cuotas_falta: number of remaining instalments
## [1] "cp_cuotas_falta"
# Single value (0) in every row; no standalone variable is created for it here.
table(data$cp_cuotas_falta)
##
## 0
## 18046
names(data)[33] # pcons_tarjeta_de_credito: whether the product is a credit card
## [1] "pcons_tarjeta_de_credito"
table(data$pcons_tarjeta_de_credito)
##
## 0 1
## 12404 5642
pcons_tarjeta_de_credito <- data$pcons_tarjeta_de_credito # already binary, kept as is
names(data)[34] # pc_cifin: customer income according to CIFIN
## [1] "pc_cifin"
# Only inspected; no standalone variable is created for it here.
summary(data$pc_cifin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 77941 0 23509011
hist(data$pc_cifin)
names(data)[35] # gestiones_prod: number of productive collection actions
## [1] "gestiones_prod"
# Only tabulated; no standalone variable is created for it here.
table(data$gestiones_prod)
##
## 0 1 2 3
## 18036 8 1 1
names(data)[36] # pcons_vehiculos_sufi: whether the product is a "sufi" vehicle loan
## [1] "pcons_vehiculos_sufi"
# Only tabulated; no standalone variable is created for it here.
table(data$pcons_vehiculos_sufi)
##
## 0 1
## 16990 1056
names(data)[37] # cluster_recod: cluster segment
## [1] "cluster_recod"
table(data$cluster_recod)
##
## 1 6 7 9 13 16 19
## 99 9105 5284 49 1867 1008 634
# Binary flag for membership in cluster 6 (the most frequent level above).
x <- ifelse(data$cluster_recod == 6, 1, 0)
table(x)
## x
## 0 1
## 8941 9105
flag_es_cluster_6 <- x
names(data)[38] # dh_avg_dia_retiros_d: average day of the month on which withdrawals are made
## [1] "dh_avg_dia_retiros_d"
hist(data$dh_avg_dia_retiros_d)
dh_avg_dia_retiros_d <- data$dh_avg_dia_retiros_d # kept untransformed
names(data)[39] # dmi_max_egreso_diario: maximum outflow in a single day of the previous month
## [1] "dmi_max_egreso_diario"
summary(data$dmi_max_egreso_diario)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 2396128 6685783 9264177 85362328
hist(data$dmi_max_egreso_diario)
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$dmi_max_egreso_diario, seq(0,1,by = 0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 7% 8% 9% 10% 11% 12% 13%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 14% 15% 16% 17% 18% 19% 20%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 21% 22% 23% 24% 25% 26% 27%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 28% 29% 30% 31% 32% 33% 34%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 35% 36% 37% 38% 39% 40% 41%
## 0.0 15805.6 108247.0 355864.7 579439.1 791444.0 1003106.8
## 42% 43% 44% 45% 46% 47% 48%
## 1152466.0 1308571.8 1472964.0 1600000.0 1782669.0 1888031.4 2008000.0
## 49% 50% 51% 52% 53% 54% 55%
## 2170653.4 2396128.5 2579869.1 2765324.0 2936515.0 3076251.0 3276314.0
## 56% 57% 58% 59% 60% 61% 62%
## 3537956.0 3805458.8 4000000.0 4148169.0 4461603.0 4688264.5 4987219.0
## 63% 64% 65% 66% 67% 68% 69%
## 5041687.1 5342686.0 5668527.5 6001403.5 6279611.5 6622351.2 7028000.0
## 70% 71% 72% 73% 74% 75% 76%
## 7400503.0 7733143.0 8032000.0 8416410.3 8911504.0 9264177.0 9856840.0
## 77% 78% 79% 80% 81% 82% 83%
## 10040000.0 10276101.0 10696562.4 11244800.0 11856206.8 12224783.0 12825890.5
## 84% 85% 86% 87% 88% 89% 90%
## 13536058.6 14305838.5 15037327.9 15742756.1 16566000.0 17589063.0 18992449.0
## 91% 92% 93% 94% 95% 96% 97%
## 20080000.0 21146267.0 22772981.0 25016338.8 27306876.0 30278258.2 35581436.4
## 98% 99% 100%
## 40866473.0 52347170.7 85362328.0
# Per-variable exploration (end of column 39, columns 40-44).
# Histogram of dmi_max_egreso_diario restricted to values at or below the cap.
x <- subset(data$dmi_max_egreso_diario, data$dmi_max_egreso_diario <= 85362328)
hist(x)
dmi_max_egreso_diario <- data$dmi_max_egreso_diario # kept untransformed
names(data)[40] # cpc_max_proc_deuda: maximum debt percentage in the previous month
## [1] "cpc_max_proc_deuda"
# Only summarised; no standalone variable is created for it here.
summary(data$cpc_max_proc_deuda)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -28.1820 0.0000 0.2023 0.4324 0.9399 13.8796
names(data)[41] # dh_cant_otros_d: number of outgoing transactions classified as "other"
## [1] "dh_cant_otros_d"
summary(data$dh_cant_otros_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 3.092 4.000 74.000
hist(data$dh_cant_otros_d)
hist(sqrt(data$dh_cant_otros_d))
dh_cant_otros_d <- sqrt(data$dh_cant_otros_d)
names(data)[42] # pc_cont_30_lt_12m_tot_sf: number of 30-day delinquencies in the last 12 months, financial sector
## [1] "pc_cont_30_lt_12m_tot_sf"
# Only inspected; no standalone variable is created for it here.
# NOTE(review): the value 64 (18 rows) looks out of range for a 12-month
# count — possibly a sentinel code; confirm against the data dictionary.
summary(data$pc_cont_30_lt_12m_tot_sf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0681 0.0000 64.0000
hist(data$pc_cont_30_lt_12m_tot_sf)
table(data$pc_cont_30_lt_12m_tot_sf)
##
## 0 1 2 3 5 6 64
## 17999 13 5 3 3 5 18
names(data)[43] # pc_cant_mora90_ult_12m_total
## [1] "pc_cant_mora90_ult_12m_total"
table(data$pc_cant_mora90_ult_12m_total)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 16882 212 145 123 84 121 79 102 97 82 57 27 35
table(ifelse(data$pc_cant_mora90_ult_12m_total >= 1, 1, 0))
##
## 0 1
## 16882 1164
# Binary flag: 1 if the count is at least 1, 0 otherwise.
x <- ifelse(data$pc_cant_mora90_ult_12m_total >= 1, 1, 0)
flag_tuvo_mora90_ult12M <- x
names(data)[44] # dmi_max_ingreso_diario: maximum inflow in a single day of the previous month
## [1] "dmi_max_ingreso_diario"
summary(data$dmi_max_ingreso_diario)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 2540003 7845783 10000027 103018788
hist(data$dmi_max_ingreso_diario)
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$dmi_max_ingreso_diario, seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 6% 7% 8% 9% 10% 11%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 12% 13% 14% 15% 16% 17%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 30% 31% 32% 33% 34% 35%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 36% 37% 38% 39% 40% 41%
## 7.2 22203.0 195000.0 490000.0 700000.0 960900.0
## 42% 43% 44% 45% 46% 47%
## 1092137.9 1300000.0 1500000.0 1656152.0 1900000.0 2000000.0
## 48% 49% 50% 51% 52% 53%
## 2100000.0 2328568.1 2540003.0 2750003.0 3000000.0 3050000.0
## 54% 55% 56% 57% 58% 59%
## 3341821.6 3600000.0 3950000.0 4032169.0 4300000.0 4594069.3
## 60% 61% 62% 63% 64% 65%
## 4950000.0 5000000.0 5193296.1 5500000.0 6000000.0 6190591.0
## 66% 67% 68% 69% 70% 71%
## 6525936.2 7000000.0 7262786.6 7724640.1 8000001.0 8560257.4
## 72% 73% 74% 75% 76% 77%
## 9000000.0 9604260.0 10000000.0 10000026.8 10340000.0 10842262.6
## 78% 79% 80% 81% 82% 83%
## 11175227.0 12000000.0 12437389.0 13000000.0 13758802.2 14462555.0
## 84% 85% 86% 87% 88% 89%
## 15200041.0 16124906.2 17541904.3 18687254.4 19862000.0 20148078.3
## 90% 91% 92% 93% 94% 95%
## 21350000.0 23972400.0 25872101.6 28563056.0 30227878.6 33514149.0
## 96% 97% 98% 99% 100%
## 38000000.0 43250075.4 50045136.0 67883640.4 103018788.0
# Per-variable exploration (end of column 44, column 45).
# Histogram of dmi_max_ingreso_diario restricted to values at or below the cap.
x <- subset(data$dmi_max_ingreso_diario, data$dmi_max_ingreso_diario <= 104909227.3)
hist(x)
dmi_max_ingreso_diario <- data$dmi_max_ingreso_diario # kept untransformed
names(data)[45] # dh_val_otros_d: total value of outgoing transactions classified as "other" in a month
## [1] "dh_val_otros_d"
summary(data$dh_val_otros_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 7586 513468 396635 9932459
hist(data$dh_val_otros_d)
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$dh_val_otros_d, seq(0,1,by=0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 7% 8% 9% 10% 11% 12% 13%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 14% 15% 16% 17% 18% 19% 20%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 21% 22% 23% 24% 25% 26% 27%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 28% 29% 30% 31% 32% 33% 34%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 35% 36% 37% 38% 39% 40% 41%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 42% 43% 44% 45% 46% 47% 48%
## 0.00 0.00 0.00 0.00 0.00 900.00 1919.00
## 49% 50% 51% 52% 53% 54% 55%
## 3222.00 7586.00 10100.00 12483.00 13271.00 13271.00 14360.50
## 56% 57% 58% 59% 60% 61% 62%
## 17452.60 20760.00 23736.00 29835.00 36771.00 49855.45 62593.60
## 63% 64% 65% 66% 67% 68% 69%
## 75757.00 90129.00 104071.00 125011.80 146179.00 175034.00 202801.25
## 70% 71% 72% 73% 74% 75% 76%
## 229524.50 253039.00 280656.40 311751.40 348722.00 396634.75 436151.80
## 77% 78% 79% 80% 81% 82% 83%
## 482810.00 528182.00 573716.00 626437.00 689522.20 759090.00 837712.00
## 84% 85% 86% 87% 88% 89% 90%
## 934669.60 1013325.00 1107249.00 1229253.00 1340862.20 1509363.45 1692885.00
## 91% 92% 93% 94% 95% 96% 97%
## 1844536.00 2006918.00 2257572.45 2517164.00 2796438.00 3337151.80 4026117.00
## 98% 99% 100%
## 4825491.20 6503208.90 9932459.00
# Per-variable exploration (end of column 45, columns 46-48).
# Histogram of dh_val_otros_d restricted to values at or below the cap.
x <- subset(data$dh_val_otros_d, data$dh_val_otros_d <= 9998335.80)
hist(x)
dh_val_otros_d <- data$dh_val_otros_d # kept untransformed
names(data)[46] # pc_ingreso_final: customer's final income
## [1] "pc_ingreso_final"
summary(data$pc_ingreso_final)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2000000 7221917 13140723 17741170 92226100
hist(data$pc_ingreso_final)
pc_ingreso_final <- data$pc_ingreso_final # kept untransformed
names(data)[47] # dh_cant_pagos_d: number of outgoing payments in the previous month
## [1] "dh_cant_pagos_d"
summary(data$dh_cant_pagos_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 3.000 4.864 8.000 49.000
hist(data$dh_cant_pagos_d)
dh_cant_pagos_d <- data$dh_cant_pagos_d # kept untransformed
names(data)[48] # dmi_ingreso_total_mes: total income of the previous month
## [1] "dmi_ingreso_total_mes"
# NOTE(review): this summary is identical to the one recorded earlier in the
# script for dh_val_entradas — possibly the same measure stored twice; check
# for duplication before keeping both in the model.
summary(data$dmi_ingreso_total_mes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 5088944 18769832 26188088 204764342
hist(data$dmi_ingreso_total_mes)
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$dmi_ingreso_total_mes, seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 36% 37% 38% 39% 40% 41%
## 8.00 24049.25 250005.50 630000.00 1056000.00 1400006.45
## 42% 43% 44% 45% 46% 47%
## 1838897.00 2199726.35 2600000.00 3000000.00 3300002.00 3790873.90
## 48% 49% 50% 51% 52% 53%
## 4106175.00 4564100.00 5088944.00 5550008.00 6004109.00 6639751.45
## 54% 55% 56% 57% 58% 59%
## 7250008.40 8010000.00 8667401.40 9450000.00 10150000.00 10675000.00
## 60% 61% 62% 63% 64% 65%
## 11550000.00 12200000.00 13019188.80 13904977.15 14787240.80 15525970.00
## 66% 67% 68% 69% 70% 71%
## 16474081.20 17555134.00 18585215.00 19600704.25 20290000.00 21541384.00
## 72% 73% 74% 75% 76% 77%
## 22528017.00 23630000.00 25156589.10 26188088.00 27948415.80 29405868.00
## 78% 79% 80% 81% 82% 83%
## 30600887.80 32212302.85 34000005.00 35944963.75 37561559.20 39459092.60
## 84% 85% 86% 87% 88% 89%
## 41691250.00 44302738.75 46987343.00 49143482.00 51159723.00 54010666.00
## 90% 91% 92% 93% 94% 95%
## 57475971.50 61466784.45 65288828.60 68862425.00 74272415.00 81843676.25
## 96% 97% 98% 99% 100%
## 88714237.00 98130984.65 111743381.40 132321500.00 204764342.00
# Per-variable exploration (end of column 48, column 49).
# Histogram of dmi_ingreso_total_mes restricted to values at or below the cap.
x <- subset(data$dmi_ingreso_total_mes, data$dmi_ingreso_total_mes <= 205163705.0)
hist(x)
dmi_ingreso_total_mes <- data$dmi_ingreso_total_mes # kept untransformed
names(data)[49] # dh_val_pagos_d: total value of outgoing transactions classified as other payments
## [1] "dh_val_pagos_d"
summary(data$dh_val_pagos_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 551214 3019832 3753468 50357360
hist(data$dh_val_pagos_d)
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$dh_val_pagos_d, seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 7% 8% 9% 10% 11% 12% 13%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 14% 15% 16% 17% 18% 19% 20%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 21% 22% 23% 24% 25% 26% 27%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 28% 29% 30% 31% 32% 33% 34%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 35% 36% 37% 38% 39% 40% 41%
## 0.0 0.0 0.0 0.0 1953.0 20000.0 40345.0
## 42% 43% 44% 45% 46% 47% 48%
## 63499.9 95093.0 127370.0 167836.2 227337.3 298628.5 373379.0
## 49% 50% 51% 52% 53% 54% 55%
## 458507.0 551214.0 645925.8 749470.0 854844.9 947913.0 1051616.0
## 56% 57% 58% 59% 60% 61% 62%
## 1162545.0 1283067.2 1388163.9 1500013.5 1610615.0 1710256.8 1847643.0
## 63% 64% 65% 66% 67% 68% 69%
## 1956650.8 2070132.4 2221932.8 2344112.0 2504857.0 2665210.0 2818400.0
## 70% 71% 72% 73% 74% 75% 76%
## 2961302.0 3074574.8 3221620.4 3432337.0 3589801.0 3753468.5 3964881.6
## 77% 78% 79% 80% 81% 82% 83%
## 4183091.0 4404183.0 4666203.0 4931637.0 5219968.5 5461909.0 5688066.2
## 84% 85% 86% 87% 88% 89% 90%
## 6002214.6 6270027.5 6668134.8 7100857.4 7530003.8 8157880.3 8782633.0
## 91% 92% 93% 94% 95% 96% 97%
## 9337860.7 10308822.0 11291463.0 12556160.2 13782306.0 15175063.0 17581258.2
## 98% 99% 100%
## 20688616.2 27004379.6 50357360.0
# Per-variable exploration (end of column 49, column 50).
# Histogram of dh_val_pagos_d restricted to values at or below the cap.
x <- subset(data$dh_val_pagos_d, data$dh_val_pagos_d <= 50357360.00)
hist(x)
dh_val_pagos_d <- data$dh_val_pagos_d # kept untransformed
names(data)[50] # pc_gasto_familiar: value of the customer's household expenses
## [1] "pc_gasto_familiar"
summary(data$pc_gasto_familiar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 723816 2070132 3368379 4335488 23067328
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$pc_gasto_familiar, seq(0,1,by=0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 7% 8% 9% 10% 11% 12% 13%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 14% 15% 16% 17% 18% 19% 20%
## 0.0 0.0 52827.5 144582.5 246500.0 350000.0 430100.0
## 21% 22% 23% 24% 25% 26% 27%
## 521914.7 588486.8 637956.8 687452.3 723815.7 763864.1 782813.0
## 28% 29% 30% 31% 32% 33% 34%
## 803882.1 836711.5 861377.8 882000.0 931155.0 975490.5 1039732.4
## 35% 36% 37% 38% 39% 40% 41%
## 1088643.0 1146363.2 1202858.0 1262250.0 1306491.8 1363816.0 1441944.3
## 42% 43% 44% 45% 46% 47% 48%
## 1519899.3 1583532.9 1653079.6 1712143.6 1775832.6 1845026.6 1910025.3
## 49% 50% 51% 52% 53% 54% 55%
## 2009880.8 2070131.5 2113333.7 2180887.5 2241724.8 2292477.7 2340728.6
## 56% 57% 58% 59% 60% 61% 62%
## 2403626.6 2454449.8 2507395.5 2550000.0 2582812.1 2629610.5 2698945.9
## 63% 64% 65% 66% 67% 68% 69%
## 2785419.6 2861250.0 2982375.6 3111782.2 3124333.0 3230361.3 3359112.7
## 70% 71% 72% 73% 74% 75% 76%
## 3545186.2 3722838.1 3892993.9 4043832.1 4196875.0 4335487.7 4530732.8
## 77% 78% 79% 80% 81% 82% 83%
## 4670284.2 4871802.6 5111082.5 5362225.0 5630442.3 5912383.7 6253169.6
## 84% 85% 86% 87% 88% 89% 90%
## 6542500.0 6842430.6 7286724.5 7673500.0 8172391.9 8594050.8 9055328.7
## 91% 92% 93% 94% 95% 96% 97%
## 9534963.8 10206885.0 10737379.9 11311996.9 12263970.4 13361803.4 14807103.4
## 98% 99% 100%
## 16718602.0 18943025.0 23067327.5
# Per-variable exploration (end of column 50, column 51).
# Histogram of pc_gasto_familiar restricted to values at or below the cap.
x <- subset(data$pc_gasto_familiar, data$pc_gasto_familiar <= 26582180.19)
hist(x)
pc_gasto_familiar <- data$pc_gasto_familiar # kept untransformed
names(data)[51] # pc_cuotas_pagadas: value of the instalments paid by the customer
## [1] "pc_cuotas_pagadas"
summary(data$pc_cuotas_pagadas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 290000 704829 900899 8156850
hist(data$pc_cuotas_pagadas)
# Percentiles in 1% steps, used to quantify the outlier cutoff.
quantile(data$pc_cuotas_pagadas, seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 8% 9% 10% 11% 12% 13% 14% 15%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 16% 17% 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29% 30% 31%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 32% 33% 34% 35% 36% 37% 38% 39%
## 0.0 0.0 43500.0 74571.4 95700.0 116000.0 118769.8 145000.0
## 40% 41% 42% 43% 44% 45% 46% 47%
## 150348.9 174000.0 184800.0 203700.0 216000.0 232000.0 240000.0 258000.0
## 48% 49% 50% 51% 52% 53% 54% 55%
## 284200.0 290000.0 290000.0 297478.3 312788.1 334276.0 346446.0 370968.9
## 56% 57% 58% 59% 60% 61% 62% 63%
## 392000.0 400305.1 410880.6 438368.6 460600.0 486710.1 490000.0 507349.8
## 64% 65% 66% 67% 68% 69% 70% 71%
## 528942.6 559000.0 602000.0 635938.0 652435.4 680000.0 727600.0 748420.0
## 72% 73% 74% 75% 76% 77% 78% 79%
## 789688.2 836217.2 860000.0 900898.7 935279.0 981092.5 1010402.6 1045573.2
## 80% 81% 82% 83% 84% 85% 86% 87%
## 1100000.0 1169898.4 1220208.2 1306574.5 1380618.1 1459464.8 1560829.4 1625384.1
## 88% 89% 90% 91% 92% 93% 94% 95%
## 1700700.2 1769856.3 1841142.1 2033524.0 2200101.2 2360000.0 2570558.7 2894792.0
## 96% 97% 98% 99% 100%
## 3178855.5 3683356.2 4743779.9 6010000.0 8156850.0
# Per-variable exploration (end of column 51, column 52).
# Histogram of pc_cuotas_pagadas restricted to values at or below the cap.
x <- subset(data$pc_cuotas_pagadas, data$pc_cuotas_pagadas <= 8156850.0)
hist(x)
pc_cuotas_pagadas <- data$pc_cuotas_pagadas # kept untransformed
names(data)[52] # cpc_avg_proc_deuda: mean debt percentage of a customer in the previous month (percentage: ratio of balance to initial value)
## [1] "cpc_avg_proc_deuda"
# Only inspected; no standalone variable is created for it here.
summary(data$cpc_avg_proc_deuda)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -119.1284 0.0000 0.1820 0.3917 0.8733 13.8796
hist(data$cpc_avg_proc_deuda)
# Percentiles in 1% steps.
quantile(data$cpc_avg_proc_deuda, seq(0,1,by=0.01))
## 0% 1% 2% 3% 4%
## -119.12840000 0.00000000 0.00000000 0.00000000 0.00000000
## 5% 6% 7% 8% 9%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 10% 11% 12% 13% 14%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 15% 16% 17% 18% 19%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 20% 21% 22% 23% 24%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 25% 26% 27% 28% 29%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 30% 31% 32% 33% 34%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 35% 36% 37% 38% 39%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 40% 41% 42% 43% 44%
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 45% 46% 47% 48% 49%
## 0.00317328 0.01566387 0.05337072 0.09155538 0.14000205
## 50% 51% 52% 53% 54%
## 0.18205318 0.22012374 0.25548425 0.28493140 0.32651283
## 55% 56% 57% 58% 59%
## 0.37242599 0.40897600 0.43696430 0.46846924 0.50734022
## 60% 61% 62% 63% 64%
## 0.53930356 0.56982057 0.59705567 0.63020920 0.65465374
## 65% 66% 67% 68% 69%
## 0.68039150 0.70248708 0.72589106 0.74845785 0.77251605
## 70% 71% 72% 73% 74%
## 0.79321046 0.81041248 0.82971103 0.84219350 0.85846768
## 75% 76% 77% 78% 79%
## 0.87328461 0.88886939 0.90253529 0.91428714 0.92869600
## 80% 81% 82% 83% 84%
## 0.94124096 0.95355144 0.96537928 0.97453072 0.98651367
## 85% 86% 87% 88% 89%
## 1.00066241 1.01072154 1.02341465 1.04034223 1.05742733
## 90% 91% 92% 93% 94%
## 1.07356220 1.08732512 1.09956045 1.11127967 1.12717757
## 95% 96% 97% 98% 99%
## 1.14653787 1.16522691 1.18974892 1.20539930 1.22587058
## 100%
## 13.87960000
# cpc_sum_proc_deuda (column 53): sum of a customer's debt percentages in the
# previous month (percentage = ratio of balance to initial value).
# NOTE(review): the summary printed below is identical to the one printed for
# cpc_avg_proc_deuda; the two columns may be duplicates -- confirm.
colnames(data)[53]
## [1] "cpc_sum_proc_deuda"
summary(data[["cpc_sum_proc_deuda"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -119.1284 0.0000 0.1820 0.3917 0.8733 13.8796
hist(data$cpc_sum_proc_deuda)
# dc_porc_prod_sin_mora (column 54): share of products without arrears across
# the whole system.
colnames(data)[54]
## [1] "dc_porc_prod_sin_mora"
summary(data[["dc_porc_prod_sin_mora"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01288 0.00000 1.00000
hist(data$dc_porc_prod_sin_mora)
# Percentiles in 1% steps.
quantile(data[["dc_porc_prod_sin_mora"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 96% 97% 98% 99% 100%
## 0 0 0 1 1
# pc_ingreso_rutina_con_techo (column 55): routine-based income after the
# per-segment caps have been applied. Copied out for the modelling table.
colnames(data)[55]
## [1] "pc_ingreso_rutina_con_techo"
pc_ingreso_rutina_con_techo <- data$pc_ingreso_rutina_con_techo
summary(pc_ingreso_rutina_con_techo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2122000 7576047 14120244 19076750 92269310
hist(data$pc_ingreso_rutina_con_techo)
# pc_saldo_prom3_tdc_entidad (column 56): average credit-card balance at the
# bank over the last 3 months.
colnames(data)[56]
## [1] "pc_saldo_prom3_tdc_entidad"
summary(data[["pc_saldo_prom3_tdc_entidad"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 92238 0 30110000
hist(data$pc_saldo_prom3_tdc_entidad)
# dh_cant_salidas (column 57): number of outgoing money transactions in a
# month. Copied out for the modelling table.
colnames(data)[57]
## [1] "dh_cant_salidas"
dh_cant_salidas <- data$dh_cant_salidas
summary(dh_cant_salidas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 15.00 27.59 42.00 404.00
hist(data$dh_cant_salidas)
# dh_min_dia_pagos_d (column 58): first day of the previous month on which the
# customer made a "credito" (wording of the original note). Copied out as well.
colnames(data)[58]
## [1] "dh_min_dia_pagos_d"
dh_min_dia_pagos_d <- data$dh_min_dia_pagos_d
summary(dh_min_dia_pagos_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 3.952 5.000 31.000
hist(data$dh_min_dia_pagos_d)
# pc_ingreso_por_rutina (column 59): routine-based income. Copied out for the
# modelling table.
# NOTE(review): the summary below matches the one printed for
# pc_ingreso_rutina_con_techo value for value; both vectors are later bound
# into `unificado`, so they may be duplicated predictors -- confirm.
colnames(data)[59]
## [1] "pc_ingreso_por_rutina"
pc_ingreso_por_rutina <- data$pc_ingreso_por_rutina
summary(pc_ingreso_por_rutina)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2122000 7576047 14120244 19076750 92269310
hist(data$pc_ingreso_por_rutina)
# dh_min_dia_pago_tarj_d (column 60): first day of the previous month on which
# the credit card was paid.
colnames(data)[60]
## [1] "dh_min_dia_pago_tarj_d"
hist(data$dh_min_dia_pago_tarj_d)
# cp_nro_cuota (column 61): number of agreed instalments per product.
# The summary below shows it is zero for every row of this dataset.
colnames(data)[61]
## [1] "cp_nro_cuota"
summary(data[["cp_nro_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# dmi_egreso_total_mes (column 62): total outflow of the previous month.
colnames(data)[62]
## [1] "dmi_egreso_total_mes"
summary(data[["dmi_egreso_total_mes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 5735566 19552715 28140390 204098909
hist(data$dmi_egreso_total_mes)
# Percentiles in 1% steps.
quantile(data[["dmi_egreso_total_mes"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 6% 7% 8% 9% 10% 11%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 12% 13% 14% 15% 16% 17%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 30% 31% 32% 33% 34% 35%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 36% 37% 38% 39% 40% 41%
## 20098.2 150464.5 469809.0 1007389.1 1364407.0 1757330.9
## 42% 43% 44% 45% 46% 47%
## 2139132.8 2597761.0 3055581.0 3491431.8 3875658.7 4341015.8
## 48% 49% 50% 51% 52% 53%
## 4728541.2 5246081.5 5735566.0 6222528.9 6750806.4 7463828.4
## 54% 55% 56% 57% 58% 59%
## 8156806.7 8903706.2 9676725.2 10353244.3 10949690.0 11653537.0
## 60% 61% 62% 63% 64% 65%
## 12507194.0 13308026.2 14158401.7 14949690.7 15668673.0 16852909.0
## 66% 67% 68% 69% 70% 71%
## 17823161.3 18701866.2 19649493.0 20633345.1 21568478.0 22936048.2
## 72% 73% 74% 75% 76% 77%
## 24164720.0 25304132.1 26608243.8 28140390.0 29543323.8 30673503.0
## 78% 79% 80% 81% 82% 83%
## 32490727.9 34304664.0 36063171.0 37668819.0 39804887.1 41664563.1
## 84% 85% 86% 87% 88% 89%
## 43936740.2 46612353.5 48890887.4 50950735.5 52832299.8 55616061.5
## 90% 91% 92% 93% 94% 95%
## 58941776.5 62642842.1 66264618.4 70686064.8 76333370.2 82037209.0
## 96% 97% 98% 99% 100%
## 90014943.2 99504343.8 113054665.2 132338269.6 204098909.0
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap is
# above the printed maximum, so no value is dropped on this data.
dmi_egreso_total_mes <- data$dmi_egreso_total_mes
x <- dmi_egreso_total_mes[which(dmi_egreso_total_mes <= 209829245.0)]
hist(x)
# cp_valor_inicial (column 63): initial value of the obligation, per product.
# The summary below shows it is zero for every row of this dataset.
colnames(data)[63]
## [1] "cp_valor_inicial"
summary(data[["cp_valor_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# dh_max_dia_otros_d (column 64): last day of the previous month on which a
# debit classified as "otros" (other) was made.
colnames(data)[64]
## [1] "dh_max_dia_otros_d"
hist(data$dh_max_dia_otros_d)
# 1 when that last day falls on or before the 15th, 0 otherwise (NA stays NA).
x <- as.numeric(data$dh_max_dia_otros_d <= 15)
table(x)
## x
## 0 1
## 7892 10154
# The name is spelled "flax_" (not "flag_"); it is kept because the cbind()
# that builds `unificado` refers to the vector by this exact name.
flax_maxDiaOtrosD_1quincena <- x
# The summary below shows this column is zero for every row of this dataset.
names(data)[65] # cp_cuota_sobre_saldo: instalment value over balance, per product
## [1] "cp_cuota_sobre_saldo"
summary(data$cp_cuota_sobre_saldo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# dc_sum_valor_inicial (column 66): sum of the initial values of obligations
# across the whole financial system.
colnames(data)[66]
## [1] "dc_sum_valor_inicial"
summary(data[["dc_sum_valor_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 602639 0 498786000
hist(data$dc_sum_valor_inicial)
# dh_cant_pago_tarj_d (column 67): number of outgoing transactions that are
# credit-card payments.
colnames(data)[67]
## [1] "dh_cant_pago_tarj_d"
summary(data[["dh_cant_pago_tarj_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.8238 1.0000 33.0000
hist(data$dh_cant_pago_tarj_d)
# dh_max_dia_pagos_d (column 68): last day of the previous month on which the
# customer made a "credito" (wording of the original note).
colnames(data)[68]
## [1] "dh_max_dia_pagos_d"
hist(data$dh_max_dia_pagos_d)
# 1 when that last day falls on or before the 15th, 0 otherwise (NA stays NA).
x <- as.numeric(data$dh_max_dia_pagos_d <= 15)
table(x)
## x
## 0 1
## 9762 8284
flag_maxDiaPagos_d_1quincena <- x
# cp_saldo_sobre_inicial (column 69): previous-month balance over initial
# value, per product. All zeros in this dataset (see summary).
colnames(data)[69]
## [1] "cp_saldo_sobre_inicial"
summary(data[["cp_saldo_sobre_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# pc_mediana_nom3 (column 70): median of the last 3 payroll payments, used for
# the payroll-loan top-up ("retanqueo libranza") calculation.
colnames(data)[70]
## [1] "pc_mediana_nom3"
summary(data[["pc_mediana_nom3"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 72055 0 64345000
hist(data$pc_mediana_nom3)
# cp_esta_cuota_otro (column 71): instalment status "otro" (other).
# All zeros in this dataset (see summary).
colnames(data)[71]
## [1] "cp_esta_cuota_otro"
summary(data[["cp_esta_cuota_otro"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
names(data)[72] # dh_max_dia_retiros_d: last day on which a withdrawal was made in the previous month
## [1] "dh_max_dia_retiros_d"
hist(data$dh_max_dia_retiros_d)
# Author's note: variable judged incorrect -- its third quartile is 30, yet a month has up to 31 days.
# NOTE(review): no summary() is run for this column here, so the quartile
# quoted above cannot be checked from this output; confirm before relying on it.
# dh_avg_dia_entradas (column 73): average day of the month on which money
# comes in.
colnames(data)[73]
## [1] "dh_avg_dia_entradas"
summary(data[["dh_avg_dia_entradas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 4.161 3.892 6.125 31.000
hist(data$dh_avg_dia_entradas)
# dh_avg_dia_pagos_d (column 74): average day of the month on which outgoing
# payments are made.
colnames(data)[74]
## [1] "dh_avg_dia_pagos_d"
# Exploratory split at day 10 (1 = on or before the 10th); the result is only
# tabulated, it is not stored under a flag name.
x <- as.numeric(data$dh_avg_dia_pagos_d <= 10)
table(x)
## x
## 0 1
## 45 18001
# dh_val_salidas (column 75): total value of outgoing transactions in a month.
colnames(data)[75]
## [1] "dh_val_salidas"
summary(data[["dh_val_salidas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 5884591 19619223 28254748 204098909
hist(data$dh_val_salidas)
# Percentiles in 1% steps.
quantile(data[["dh_val_salidas"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 6% 7% 8% 9% 10% 11%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 12% 13% 14% 15% 16% 17%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 30% 31% 32% 33% 34% 35%
## 0.0 0.0 0.0 0.0 11359.0 12350.0
## 36% 37% 38% 39% 40% 41%
## 76400.8 342377.0 679878.3 1118683.2 1541005.0 1938851.1
## 42% 43% 44% 45% 46% 47%
## 2298733.0 2740303.9 3183321.0 3592226.5 4022760.0 4466078.0
## 48% 49% 50% 51% 52% 53%
## 4917904.6 5332810.0 5884591.0 6356795.0 6873811.8 7663007.0
## 54% 55% 56% 57% 58% 59%
## 8257288.2 9064578.0 9809325.0 10475107.3 11023429.0 11765300.4
## 60% 61% 62% 63% 64% 65%
## 12624035.0 13404973.0 14242719.6 15034490.1 15802825.6 16946976.0
## 66% 67% 68% 69% 70% 71%
## 17905644.0 18764929.0 19700935.4 20697842.0 21641359.5 23028731.0
## 72% 73% 74% 75% 76% 77%
## 24249542.8 25385925.3 26679698.7 28254747.5 29644945.6 30871852.0
## 78% 79% 80% 81% 82% 83%
## 32539082.0 34463859.1 36134965.0 37813640.0 39947167.0 41741354.8
## 84% 85% 86% 87% 88% 89%
## 43999606.0 46745102.2 48942396.4 50977400.0 52974824.8 55714561.0
## 90% 91% 92% 93% 94% 95%
## 59028184.5 62668751.4 66304736.6 70688799.0 76451589.0 82043497.0
## 96% 97% 98% 99% 100%
## 90022669.6 99748689.3 113054665.2 132338269.6 204098909.0
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap is
# above the printed maximum, so no value is dropped on this data.
dh_val_salidas <- data$dh_val_salidas
x <- dh_val_salidas[which(dh_val_salidas <= 217011100)]
hist(x)
# NOTE(review): the original note described this column as "total value of
# outgoing transactions in a month", which repeats the description given for
# dh_val_salidas and does not fit this column's name or the values printed
# below. Presumably it is the sum of instalment values across the financial
# system -- confirm against the data dictionary.
names(data)[76] # dc_sum_valor_cuota: description unverified (see note above)
## [1] "dc_sum_valor_cuota"
summary(data$dc_sum_valor_cuota)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 26756 0 10504000
hist(data$dc_sum_valor_cuota)
# dh_min_dia_tras_d (column 77): first day of the previous month on which a
# transfer payment was made.
colnames(data)[77]
## [1] "dh_min_dia_tras_d"
hist(data$dh_min_dia_tras_d)
# Exploratory split at day 10 (1 = on or before the 10th); only tabulated.
x <- as.numeric(data$dh_min_dia_tras_d <= 10)
table(x)
## x
## 0 1
## 1640 16406
# cp_porc_valorcuot_ing (column 78): instalment value relative to income, per
# product. All zeros in this dataset (see summary).
colnames(data)[78]
## [1] "cp_porc_valorcuot_ing"
summary(data[["cp_porc_valorcuot_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# pc_ind_ajustado (column 79): the customer's adjusted net disposable income.
colnames(data)[79]
## [1] "pc_ind_ajustado"
summary(data[["pc_ind_ajustado"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6478753 319157 3173260 6474900 8958133 59465796
hist(data$pc_ind_ajustado)
# Percentiles in 1% steps.
quantile(data[["pc_ind_ajustado"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -6478752.60 -1584635.43 -736622.04 -363284.48 -136329.26 -28157.31
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 10854.35 95242.50 166275.00
## 24% 25% 26% 27% 28% 29%
## 229297.05 319157.46 414630.00 515272.66 656092.98 773018.33
## 30% 31% 32% 33% 34% 35%
## 851723.71 955501.35 1106847.00 1197787.50 1316281.58 1374374.25
## 36% 37% 38% 39% 40% 41%
## 1463705.35 1558277.54 1644887.26 1739666.44 1860434.10 1979169.46
## 42% 43% 44% 45% 46% 47%
## 2082131.05 2210792.04 2333189.63 2474835.75 2578853.57 2661750.00
## 48% 49% 50% 51% 52% 53%
## 2819938.87 2966919.38 3173260.41 3314359.51 3431853.07 3568431.40
## 54% 55% 56% 57% 58% 59%
## 3786186.59 4016250.00 4159682.15 4360409.42 4488738.16 4630921.05
## 60% 61% 62% 63% 64% 65%
## 4760138.69 4974054.87 5297034.28 5504701.54 5846234.27 6117532.40
## 66% 67% 68% 69% 70% 71%
## 6380682.46 6673624.40 6940216.30 7237858.50 7635737.48 7844523.75
## 72% 73% 74% 75% 76% 77%
## 8066551.78 8352009.55 8678789.30 8958132.53 9306000.90 9765300.34
## 78% 79% 80% 81% 82% 83%
## 10257365.98 10770282.61 11156934.11 11599371.56 12020464.91 12773224.12
## 84% 85% 86% 87% 88% 89%
## 13320885.71 13937072.75 14681655.47 15506136.16 16123060.47 17059105.95
## 90% 91% 92% 93% 94% 95%
## 17978599.46 19117837.97 20277254.42 22345186.29 24072384.41 25778069.55
## 96% 97% 98% 99% 100%
## 28130216.60 30465696.15 33894017.04 38331857.83 59465796.00
# Histogram of the non-missing pc_ind_ajustado values at or below the cap (the
# same values subset() returned). The cap is above the printed maximum, so no
# value is dropped on this data.
x <- with(data, pc_ind_ajustado[which(pc_ind_ajustado <= 68107989.8)])
hist(x)
# dh_val_retiros_d (column 80): total value of outgoing transactions that are
# withdrawals, in a month.
colnames(data)[80]
## [1] "dh_val_retiros_d"
summary(data[["dh_val_retiros_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 1235198 11122177 13014757 197776495
hist(data$dh_val_retiros_d)
# pc_tiem_lt_prod_abie_total (column 81): time of the last product opened
# (literal translation of the original note; unit not stated).
colnames(data)[81]
## [1] "pc_tiem_lt_prod_abie_total"
summary(data[["pc_tiem_lt_prod_abie_total"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0778 0.0000 58.0000
hist(data$pc_tiem_lt_prod_abie_total)
# marca_info_cifin_decode (column 82): CIFIN mark (queried, not queried, not
# found, etc.).
colnames(data)[82]
## [1] "marca_info_cifin_decode"
table(data[["marca_info_cifin_decode"]])
##
## 0 1 2
## 12258 252 5536
# 1 when the mark equals 0, 0 for any other code.
# NOTE(review): the flag name implies code 0 means "found in CIFIN"; the code
# mapping is not shown here -- confirm.
x <- as.numeric(data$marca_info_cifin_decode == 0)
table(x)
## x
## 0 1
## 5788 12258
flag_encontrado_cifin <- x
# dh_max_dia_pago_tarj_d (column 83): last day of the previous month on which
# the credit card was paid.
colnames(data)[83]
## [1] "dh_max_dia_pago_tarj_d"
hist(data$dh_max_dia_pago_tarj_d)
# dc_valobli_ing (column 84): sum of the initial values of obligations in the
# financial system, divided by income.
colnames(data)[84]
## [1] "dc_valobli_ing"
summary(data[["dc_valobli_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05584 0.00000 58.56358
hist(data$dc_valobli_ing)
# pc_cantidad_tdc_entidad (column 85): number of credit cards held at the bank.
colnames(data)[85]
## [1] "pc_cantidad_tdc_entidad"
table(data[["pc_cantidad_tdc_entidad"]])
##
## 0 1 2 3 4
## 17833 93 56 13 51
# dh_min_dia_otros_d (column 86): first day of the previous month on which a
# debit classified as "otros" (other) was made.
colnames(data)[86]
## [1] "dh_min_dia_otros_d"
hist(data$dh_min_dia_otros_d)
# Exploratory split at day 15 (1 = on or before the 15th); only tabulated.
x <- as.numeric(data$dh_min_dia_otros_d <= 15)
table(x)
## x
## 0 1
## 1976 16070
# dc_cant_obligaciones (column 87): number of obligations.
colnames(data)[87]
## [1] "dc_cant_obligaciones"
table(data[["dc_cant_obligaciones"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 11 13
## 17809 18 10 32 22 23 23 10 62 8 2 11 1
## 14 15 17 18 20
## 1 4 3 6 1
summary(data[["dc_cant_obligaciones"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08423 0.00000 20.00000
hist(data$dc_cant_obligaciones)
# Exploratory indicator: 1 when the customer has at least one obligation.
# Only tabulated, not stored under a flag name.
x <- as.numeric(data$dc_cant_obligaciones >= 1)
table(x)
## x
## 0 1
## 17809 237
# cpc_sum_nro_cuota (column 88): sum of the number of instalments over all of
# the customer's obligations.
colnames(data)[88]
## [1] "cpc_sum_nro_cuota"
summary(data[["cpc_sum_nro_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 20.00 78.43 107.00 1108.00
# cpc_avg_nro_cuota (column 89): average number of instalments across the
# customer's products. Copied out for the modelling table.
colnames(data)[89]
## [1] "cpc_avg_nro_cuota"
cpc_avg_nro_cuota <- data$cpc_avg_nro_cuota
summary(cpc_avg_nro_cuota)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 14.00 34.64 51.00 446.00
hist(data$cpc_avg_nro_cuota)
# cpc_max_nro_cuota (column 90): largest number of instalments among the
# customer's products.
colnames(data)[90]
## [1] "cpc_max_nro_cuota"
# cp_saldo (column 91): previous-month balance, per product.
# All zeros in this dataset (see summary).
colnames(data)[91]
## [1] "cp_saldo"
summary(data[["cp_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# cp_cuota_sobre_inicial (column 92): instalment value over the initially
# disbursed value, per product. All zeros in this dataset (see summary).
colnames(data)[92]
## [1] "cp_cuota_sobre_inicial"
summary(data[["cp_cuota_sobre_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# cpc_sum_saldo (column 93): sum of the balances of all the customer's active
# products.
colnames(data)[93]
## [1] "cpc_sum_saldo"
summary(data[["cpc_sum_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1129268 0 1108448 11329613 8989011 220890667
hist(data$cpc_sum_saldo)
# Percentiles in 1% steps.
quantile(data[["cpc_sum_saldo"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -1129268.0 0.0 0.0 0.0 0.0 0.0
## 6% 7% 8% 9% 10% 11%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 12% 13% 14% 15% 16% 17%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 30% 31% 32% 33% 34% 35%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 36% 37% 38% 39% 40% 41%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 42% 43% 44% 45% 46% 47%
## 0.0 0.0 0.0 33950.0 110565.4 348394.0
## 48% 49% 50% 51% 52% 53%
## 613901.0 900660.1 1108447.5 1439348.3 1743853.2 2009279.1
## 54% 55% 56% 57% 58% 59%
## 2230500.0 2464656.0 2738263.0 3022119.5 3254198.7 3529597.0
## 60% 61% 62% 63% 64% 65%
## 3829456.0 4079343.0 4345954.0 4611392.3 4843450.0 5057907.0
## 66% 67% 68% 69% 70% 71%
## 5295887.0 5527366.0 5700842.4 5994608.2 6311746.0 6793297.0
## 72% 73% 74% 75% 76% 77%
## 7411579.2 7832990.0 8379714.0 8989011.2 9577423.2 10105086.2
## 78% 79% 80% 81% 82% 83%
## 10746466.0 11413477.4 12158710.0 12852364.0 14071157.1 14697477.0
## 84% 85% 86% 87% 88% 89%
## 15465740.8 16480607.0 17754309.9 19526399.3 21521641.0 23601753.0
## 90% 91% 92% 93% 94% 95%
## 27460463.5 32852491.0 39550370.6 45636780.7 56445784.0 68892210.8
## 96% 97% 98% 99% 100%
## 79424456.4 96412554.3 116216129.4 145505224.0 220890667.4
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap is
# above the printed maximum, so no value is dropped on this data.
cpc_sum_saldo <- data$cpc_sum_saldo
x <- cpc_sum_saldo[which(cpc_sum_saldo <= 220905770.00)]
hist(x)
# cp_porc_saldo_ing (column 94): balance over income, per product.
# All zeros in this dataset (see summary).
colnames(data)[94]
## [1] "cp_porc_saldo_ing"
summary(data[["cp_porc_saldo_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# gsm_mejor_gestion (column 95): best management action ("gestion") carried
# out in the previous month.
colnames(data)[95]
## [1] "gsm_mejor_gestion"
table(data[["gsm_mejor_gestion"]])
##
## 0 16
## 18044 2
# dh_min_dia_nomina_c (column 96): no description was recorded for this column.
colnames(data)[96]
## [1] "dh_min_dia_nomina_c"
# dh_max_dia_nomina_c (column 97): last day of the previous month on which a
# payroll payment was received.
colnames(data)[97]
## [1] "dh_max_dia_nomina_c"
hist(data$dh_max_dia_nomina_c)
# cp_valor_cuota (column 98): instalment value, per product.
# All zeros in this dataset (see summary).
colnames(data)[98]
## [1] "cp_valor_cuota"
summary(data[["cp_valor_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# cpc_nro_cuota_tdc (column 99): sum of the number of instalments over all of
# the customer's credit cards.
colnames(data)[99]
## [1] "cpc_nro_cuota_tdc"
# gsm_prom_dias_gest (column 100): average of the days on which management
# actions were carried out in the previous month.
colnames(data)[100]
## [1] "gsm_prom_dias_gest"
# pc_cuota_no_rot_ent (column 101): instalment of non-revolving products at
# the bank.
colnames(data)[101]
## [1] "pc_cuota_no_rot_ent"
summary(data[["pc_cuota_no_rot_ent"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 23699 0 6342000
hist(data$pc_cuota_no_rot_ent)
# dh_val_nomina_c (column 102): total value of incoming payroll transactions
# in a month.
colnames(data)[102]
## [1] "dh_val_nomina_c"
summary(data[["dh_val_nomina_c"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 101166 0 42000000
hist(data$dh_val_nomina_c)
# Percentiles in 1% steps.
quantile(data[["dh_val_nomina_c"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7%
## 0 0 0 0 0 0 0 0
## 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23%
## 0 0 0 0 0 0 0 0
## 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0
## 32% 33% 34% 35% 36% 37% 38% 39%
## 0 0 0 0 0 0 0 0
## 40% 41% 42% 43% 44% 45% 46% 47%
## 0 0 0 0 0 0 0 0
## 48% 49% 50% 51% 52% 53% 54% 55%
## 0 0 0 0 0 0 0 0
## 56% 57% 58% 59% 60% 61% 62% 63%
## 0 0 0 0 0 0 0 0
## 64% 65% 66% 67% 68% 69% 70% 71%
## 0 0 0 0 0 0 0 0
## 72% 73% 74% 75% 76% 77% 78% 79%
## 0 0 0 0 0 0 0 0
## 80% 81% 82% 83% 84% 85% 86% 87%
## 0 0 0 0 0 0 0 0
## 88% 89% 90% 91% 92% 93% 94% 95%
## 0 0 0 0 0 0 0 0
## 96% 97% 98% 99% 100%
## 0 0 600000 2173865 42000000
# banca_completa (column 103): whether the customer belongs to the
# "banca completa" segment. Single value (0) in this dataset, per the table.
colnames(data)[103]
## [1] "banca_completa"
table(data[["banca_completa"]])
##
## 0
## 18046
# cpc_saldo_sobre_ing (column 104): balance over income, per customer.
colnames(data)[104]
## [1] "cpc_saldo_sobre_ing"
summary(data[["cpc_saldo_sobre_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.07529 0.00000 0.03420 0.70105 0.50141 20.23908
hist(data$cpc_saldo_sobre_ing)
# Percentiles in 1% steps.
quantile(data[["cpc_saldo_sobre_ing"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -0.075284533 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 6% 7% 8% 9% 10% 11%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 12% 13% 14% 15% 16% 17%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 18% 19% 20% 21% 22% 23%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 24% 25% 26% 27% 28% 29%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 30% 31% 32% 33% 34% 35%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 36% 37% 38% 39% 40% 41%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 42% 43% 44% 45% 46% 47%
## 0.000000000 0.000000000 0.000000000 0.001327638 0.003920323 0.010485818
## 48% 49% 50% 51% 52% 53%
## 0.017391611 0.026439748 0.034204096 0.042727976 0.053022024 0.064025658
## 54% 55% 56% 57% 58% 59%
## 0.072661546 0.082753374 0.095978925 0.109599685 0.119360811 0.133522179
## 60% 61% 62% 63% 64% 65%
## 0.145511888 0.153764215 0.165604222 0.180229507 0.195539646 0.214586774
## 66% 67% 68% 69% 70% 71%
## 0.239741468 0.260049473 0.280311386 0.297297228 0.321735771 0.349650555
## 72% 73% 74% 75% 76% 77%
## 0.378458794 0.413185699 0.459745553 0.501406376 0.534302699 0.570659962
## 78% 79% 80% 81% 82% 83%
## 0.614790560 0.670961859 0.723906664 0.810553439 0.862300980 0.949874583
## 84% 85% 86% 87% 88% 89%
## 1.034139144 1.126322423 1.224357805 1.353146272 1.492494873 1.647897690
## 90% 91% 92% 93% 94% 95%
## 1.813319044 1.978121143 2.180685275 2.429309111 2.848231115 3.195820189
## 96% 97% 98% 99% 100%
## 4.530609759 6.130061812 7.594766901 9.837172114 20.239077899
# Keep the whole cpc_saldo_sobre_ing column for the modelling table, then plot
# its non-missing values at or below the cap (the same values subset()
# returned). The cap is above the printed maximum, so nothing is dropped here.
cpc_saldo_sobre_ing <- data$cpc_saldo_sobre_ing
x <- cpc_saldo_sobre_ing[which(cpc_saldo_sobre_ing <= 20.307294513)]
hist(x)
# dh_min_dia_pago_cred_d (column 105): first day of the previous month on
# which a loan payment was made. Copied out for the modelling table.
colnames(data)[105]
## [1] "dh_min_dia_pago_cred_d"
dh_min_dia_pago_cred_d <- data$dh_min_dia_pago_cred_d
hist(data$dh_min_dia_pago_cred_d)
# cpc_saldo_tdc (column 106): the customer's total credit-card balance.
colnames(data)[106]
## [1] "cpc_saldo_tdc"
summary(data[["cpc_saldo_tdc"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1129268 0 318800 4323266 5989105 42938793
hist(data$cpc_saldo_tdc)
# Percentiles in 1% steps.
quantile(data[["cpc_saldo_tdc"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6%
## -1129268.0 -45.1 0.0 0.0 0.0 0.0 0.0
## 7% 8% 9% 10% 11% 12% 13%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 14% 15% 16% 17% 18% 19% 20%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 21% 22% 23% 24% 25% 26% 27%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 28% 29% 30% 31% 32% 33% 34%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 35% 36% 37% 38% 39% 40% 41%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 42% 43% 44% 45% 46% 47% 48%
## 0.0 0.0 0.0 0.0 0.0 4536.0 39499.0
## 49% 50% 51% 52% 53% 54% 55%
## 131904.3 318800.0 558948.6 796776.6 1028169.0 1249210.2 1555273.0
## 56% 57% 58% 59% 60% 61% 62%
## 1809553.0 2020438.0 2214443.8 2427540.6 2635538.0 2846335.5 3087026.0
## 63% 64% 65% 66% 67% 68% 69%
## 3329693.5 3592291.4 3829456.0 4031553.2 4257540.0 4491031.6 4722643.2
## 70% 71% 72% 73% 74% 75% 76%
## 4932761.5 5111416.0 5366764.6 5540825.0 5735475.3 5989105.0 6209213.4
## 77% 78% 79% 80% 81% 82% 83%
## 6643593.0 7185642.5 7609331.2 8034478.0 8497649.0 9048885.3 9573365.3
## 84% 85% 86% 87% 88% 89% 90%
## 10016976.0 10596398.2 11109566.0 11723089.0 12303712.2 13004694.4 14105014.5
## 91% 92% 93% 94% 95% 96% 97%
## 14697477.0 15541141.0 16460852.1 17506625.0 19008508.5 20708956.0 22249488.6
## 98% 99% 100%
## 25956592.3 30683139.0 42938793.0
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap is
# above the printed maximum, so no value is dropped on this data.
cpc_saldo_tdc <- data$cpc_saldo_tdc
x <- cpc_saldo_tdc[which(cpc_saldo_tdc <= 43186883.96)]
hist(x)
# pc_cuota_de_consumo (column 107): consumer-credit instalment reported by
# CIFIN.
colnames(data)[107]
## [1] "pc_cuota_de_consumo"
summary(data[["pc_cuota_de_consumo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 22909 0 6342000
hist(data$pc_cuota_de_consumo)
# dh_cant_tras_d (column 108): number of outgoing money transfers in a month.
colnames(data)[108]
## [1] "dh_cant_tras_d"
hist(data$dh_cant_tras_d)
# Percentiles in 0.5% steps (finer grid than for the other columns).
quantile(data[["dh_cant_tras_d"]], probs = seq(0, 1, by = 0.005))
## 0.0% 0.5% 1.0% 1.5% 2.0% 2.5% 3.0% 3.5% 4.0% 4.5% 5.0%
## 0 0 0 0 0 0 0 0 0 0 0
## 5.5% 6.0% 6.5% 7.0% 7.5% 8.0% 8.5% 9.0% 9.5% 10.0% 10.5%
## 0 0 0 0 0 0 0 0 0 0 0
## 11.0% 11.5% 12.0% 12.5% 13.0% 13.5% 14.0% 14.5% 15.0% 15.5% 16.0%
## 0 0 0 0 0 0 0 0 0 0 0
## 16.5% 17.0% 17.5% 18.0% 18.5% 19.0% 19.5% 20.0% 20.5% 21.0% 21.5%
## 0 0 0 0 0 0 0 0 0 0 0
## 22.0% 22.5% 23.0% 23.5% 24.0% 24.5% 25.0% 25.5% 26.0% 26.5% 27.0%
## 0 0 0 0 0 0 0 0 0 0 0
## 27.5% 28.0% 28.5% 29.0% 29.5% 30.0% 30.5% 31.0% 31.5% 32.0% 32.5%
## 0 0 0 0 0 0 0 0 0 0 0
## 33.0% 33.5% 34.0% 34.5% 35.0% 35.5% 36.0% 36.5% 37.0% 37.5% 38.0%
## 0 0 0 0 0 0 0 0 0 0 0
## 38.5% 39.0% 39.5% 40.0% 40.5% 41.0% 41.5% 42.0% 42.5% 43.0% 43.5%
## 0 0 0 0 0 0 0 0 0 0 0
## 44.0% 44.5% 45.0% 45.5% 46.0% 46.5% 47.0% 47.5% 48.0% 48.5% 49.0%
## 0 0 0 0 0 0 0 0 0 0 0
## 49.5% 50.0% 50.5% 51.0% 51.5% 52.0% 52.5% 53.0% 53.5% 54.0% 54.5%
## 0 0 0 0 0 0 0 0 0 0 0
## 55.0% 55.5% 56.0% 56.5% 57.0% 57.5% 58.0% 58.5% 59.0% 59.5% 60.0%
## 0 0 0 0 0 0 0 0 0 0 0
## 60.5% 61.0% 61.5% 62.0% 62.5% 63.0% 63.5% 64.0% 64.5% 65.0% 65.5%
## 0 0 0 0 0 0 0 0 0 0 1
## 66.0% 66.5% 67.0% 67.5% 68.0% 68.5% 69.0% 69.5% 70.0% 70.5% 71.0%
## 1 1 1 1 1 1 1 1 1 1 1
## 71.5% 72.0% 72.5% 73.0% 73.5% 74.0% 74.5% 75.0% 75.5% 76.0% 76.5%
## 1 1 2 2 2 2 2 2 2 2 2
## 77.0% 77.5% 78.0% 78.5% 79.0% 79.5% 80.0% 80.5% 81.0% 81.5% 82.0%
## 3 3 3 3 3 3 3 4 4 4 4
## 82.5% 83.0% 83.5% 84.0% 84.5% 85.0% 85.5% 86.0% 86.5% 87.0% 87.5%
## 4 4 5 5 5 5 6 6 6 6 7
## 88.0% 88.5% 89.0% 89.5% 90.0% 90.5% 91.0% 91.5% 92.0% 92.5% 93.0%
## 7 8 8 8 9 9 9 10 10 11 11
## 93.5% 94.0% 94.5% 95.0% 95.5% 96.0% 96.5% 97.0% 97.5% 98.0% 98.5%
## 12 12 13 14 14 15 16 17 18 19 20
## 99.0% 99.5% 100.0%
## 23 25 31
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap
# equals the 100% quantile printed above, so no value is dropped on this data.
dh_cant_tras_d <- data$dh_cant_tras_d
x <- dh_cant_tras_d[which(dh_cant_tras_d <= 31)]
hist(x)
# dh_max_dia_comisio_d (column 109): no description was recorded for this
# column.
colnames(data)[109]
## [1] "dh_max_dia_comisio_d"
# cpc_avg_saldo (column 110): average balance of the customer's obligations in
# the previous month.
colnames(data)[110]
## [1] "cpc_avg_saldo"
summary(data[["cpc_avg_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1011469 0 774422 5152861 4793250 106882802
hist(data$cpc_avg_saldo)
# Percentiles in 1% steps.
quantile(data[["cpc_avg_saldo"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -1011469.0 0.0 0.0 0.0 0.0 0.0
## 6% 7% 8% 9% 10% 11%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 12% 13% 14% 15% 16% 17%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 30% 31% 32% 33% 34% 35%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 36% 37% 38% 39% 40% 41%
## 0.0 0.0 0.0 0.0 0.0 0.0
## 42% 43% 44% 45% 46% 47%
## 0.0 0.0 0.0 16975.0 69850.0 206899.0
## 48% 49% 50% 51% 52% 53%
## 365970.0 565479.8 774421.5 940160.4 1053110.0 1202192.1
## 54% 55% 56% 57% 58% 59%
## 1358774.5 1517750.0 1668502.5 1818995.1 1946590.0 2064736.0
## 60% 61% 62% 63% 64% 65%
## 2199797.0 2328455.4 2429205.4 2528953.5 2676266.0 2793238.0
## 66% 67% 68% 69% 70% 71%
## 2973823.1 3124962.0 3305366.2 3512956.1 3720362.0 3908560.0
## 72% 73% 74% 75% 76% 77%
## 4146008.0 4315487.0 4534635.6 4793250.0 4927245.0 5105438.0
## 78% 79% 80% 81% 82% 83%
## 5327563.0 5540825.0 5755606.8 6023367.8 6305529.5 6769565.3
## 84% 85% 86% 87% 88% 89%
## 7375373.0 7884712.2 8498791.6 9327747.6 10567824.0 11633631.6
## 90% 91% 92% 93% 94% 95%
## 13229350.7 15008544.5 17404416.0 19849268.4 21885903.6 25137925.4
## 96% 97% 98% 99% 100%
## 33473766.1 40065113.3 47628017.1 66073881.0 106882801.8
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap is
# above the printed maximum, so no value is dropped on this data.
cpc_avg_saldo <- data$cpc_avg_saldo
x <- cpc_avg_saldo[which(cpc_avg_saldo <= 106884393.50)]
hist(x)
# dc_max_saldo_sf (column 111): maximum balance in the financial system
# (financial sector only).
colnames(data)[111]
## [1] "dc_max_saldo_sf"
summary(data[["dc_max_saldo_sf"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 259660 0 182310000
# dh_val_pago_tarj_d (column 112): total value of outgoing transactions that
# are credit-card payments, in a month.
colnames(data)[112]
## [1] "dh_val_pago_tarj_d"
summary(data[["dh_val_pago_tarj_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 427965 310071 10559486
hist(data$dh_val_pago_tarj_d)
# Percentiles in 1% steps.
quantile(data[["dh_val_pago_tarj_d"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 7% 8% 9% 10% 11% 12% 13%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 14% 15% 16% 17% 18% 19% 20%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 21% 22% 23% 24% 25% 26% 27%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 28% 29% 30% 31% 32% 33% 34%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 35% 36% 37% 38% 39% 40% 41%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 42% 43% 44% 45% 46% 47% 48%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 49% 50% 51% 52% 53% 54% 55%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 56% 57% 58% 59% 60% 61% 62%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 63% 64% 65% 66% 67% 68% 69%
## 0.0 0.0 0.0 6823.0 20000.0 34674.4 61262.1
## 70% 71% 72% 73% 74% 75% 76%
## 113870.0 154310.6 190467.0 230752.0 268506.0 310071.0 353957.8
## 77% 78% 79% 80% 81% 82% 83%
## 401450.0 440004.8 485001.7 549448.0 611841.9 681124.1 720438.2
## 84% 85% 86% 87% 88% 89% 90%
## 795860.0 864850.0 946432.5 1017867.0 1122199.0 1239723.7 1383143.5
## 91% 92% 93% 94% 95% 96% 97%
## 1511667.0 1671642.8 1883813.3 2088449.4 2368354.8 2818150.4 3187581.8
## 98% 99% 100%
## 3910841.7 5267074.4 10559486.0
# Keep the whole column for the modelling table, then plot its non-missing
# values at or below the cap (the same values subset() returned). The cap is
# above the printed maximum, so no value is dropped on this data.
dh_val_pago_tarj_d <- data$dh_val_pago_tarj_d
x <- dh_val_pago_tarj_d[which(dh_val_pago_tarj_d <= 10560315.0)]
hist(x)
# pc_productos_no_rotativos_entidad (column 113): number of non-revolving
# products held at the bank.
colnames(data)[113]
## [1] "pc_productos_no_rotativos_entidad"
table(data[["pc_productos_no_rotativos_entidad"]])
##
## 0 1 2 3 4 5 6 9 18
## 17850 54 38 20 36 4 10 1 33
hist(data$pc_productos_no_rotativos_entidad)
names(data)[114] # pc_saldo_no_rot_ent: non-revolving obligations at the bank
## [1] "pc_saldo_no_rot_ent"
summary(data$pc_saldo_no_rot_ent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 918154 0 229052000
# NOTE(review): the original note gave this column the same description as
# pc_saldo_no_rot_ent ("non-revolving obligations at the bank"). The "vi" in
# the name presumably stands for "valor inicial" (initial value) -- confirm
# against the data dictionary.
names(data)[115] # pc_vi_no_rotativos_entidad: description unverified (see note above)
## [1] "pc_vi_no_rotativos_entidad"
summary(data$pc_vi_no_rotativos_entidad)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 1033 0 233279
# dh_min_dia_entradas (column 116): no description was recorded for this
# column.
colnames(data)[116]
## [1] "dh_min_dia_entradas"
# pcons_hipotecario_vivienda (column 117): whether the product is a mortgage.
# Copied out for the modelling table.
colnames(data)[117]
## [1] "pcons_hipotecario_vivienda"
pcons_hipotecario_vivienda <- data$pcons_hipotecario_vivienda
table(data[["pcons_hipotecario_vivienda"]])
##
## 0 1
## 17500 546
# gsm_mejor_gestion_3m (column 118): best management action ("gestion")
# carried out in the last 3 months.
colnames(data)[118]
## [1] "gsm_mejor_gestion_3m"
table(data[["gsm_mejor_gestion_3m"]])
##
## 0 16
## 18044 2
names(data)[119] # dh_avg_dia_salidas: Dia promedio del mes en el que hace las salidas de dinero
## [1] "dh_avg_dia_salidas"
names(data)[120] # y_auto_cura: Variable respuesta: si el cliente se autocuro o no
## [1] "y_auto_cura"
table(data$y_auto_cura)
##
## 0 1
## 5590 12456
Y <- data$y_auto_cura
names(data)[122] # segmentoestructural: Segmento estructural
## [1] "segmentoestructural"
table(data$segmentoestructural)
##
## PYME
## 18046
names(data)[123] # subsegmentoestructural
## [1] "subsegmentoestructural"
table(data$subsegmentoestructural)
##
## Pyme Pequena
## 18046
# Assemble the modelling dataset: the response Y followed by the candidate
# predictors selected during the exploration above.
# cbind() builds a matrix first, so every input is coerced to one common type
# before as.data.frame() is applied (presumably all inputs are numeric vectors
# of equal length -- verify where each one is created).
# NOTE(review): `flax_maxDiaOtrosD_1quincena` is spelled "flax" rather than
# "flag"; the same spelling is used in the model formula further down.
unificado <- as.data.frame(
cbind(
Y,
max_sem,
desv_sem,
prom_bim,
max_mes_anterior,
prom_mes_anterior,
prom_sem,
max_bim,
prom_trim,
pc_cant_moras_30_ult_12_meses,
desv_trim,
desv_bim,
dh_cant_entradas,
pc_transaccional,
dh_val_entradas,
pcons_tarjeta_de_credito,
dh_avg_dia_retiros_d,
dmi_max_egreso_diario,
dh_cant_otros_d,
dmi_max_ingreso_diario,
dh_val_otros_d,
pc_ingreso_final,
dh_cant_pagos_d,
dmi_ingreso_total_mes,
dh_val_pagos_d,
pc_gasto_familiar,
pc_cuotas_pagadas,
pc_ingreso_rutina_con_techo,
dh_cant_salidas,
dh_min_dia_pagos_d,
pc_ingreso_por_rutina,
dmi_egreso_total_mes,
dh_val_salidas,
cpc_avg_nro_cuota,
cpc_sum_saldo,
cpc_saldo_sobre_ing,
dh_min_dia_pago_cred_d,
cpc_saldo_tdc,
dh_cant_tras_d,
cpc_avg_saldo,
dh_val_pago_tarj_d,
pcons_hipotecario_vivienda,
flag_mora60_ult12meses,
flag_ultima_entrada_1quincena,
flag_tuvo_mora60_ult3meses,
flag_diaPago_1quincena,
flag_es_cluster_6,
flag_tuvo_mora90_ult12M,
flax_maxDiaOtrosD_1quincena,
flag_maxDiaPagos_d_1quincena,
flag_encontrado_cifin
)
)
dim(unificado)
## [1] 18046 51
str(unificado)
## 'data.frame': 18046 obs. of 51 variables:
## $ Y : num 0 0 0 0 0 0 1 1 1 1 ...
## $ max_sem : num 7 17 36 27 32 32 0 0 0 12 ...
## $ desv_sem : num 2.86 7.64 12.87 10.89 11.65 ...
## $ prom_bim : num 1.87 3.46 3.67 3.67 3.16 ...
## $ max_mes_anterior : num 7 17 27 27 20 27 0 0 0 12 ...
## $ prom_mes_anterior : num 2.65 4.12 5.2 5.2 4.47 ...
## $ prom_sem : num 1.17 8 21 4.83 20.83 ...
## $ max_bim : num 7 17 27 27 20 30 0 0 0 12 ...
## $ prom_trim : num 1.53 3.7 3.74 3 4.16 ...
## $ pc_cant_moras_30_ult_12_meses: num 1 1 0 1 0 0 0 0 0 0 ...
## $ desv_trim : num 2.01 2.4 3.68 3.95 4.02 ...
## $ desv_bim : num 2.22 2.66 4.37 4.37 3.76 ...
## $ dh_cant_entradas : num 0 3.74 2.24 0 1.41 ...
## $ pc_transaccional : num 3000000 7710300 6670935 3000000 3150000 ...
## $ dh_val_entradas : num 0 2110 1785 0 1000 ...
## $ pcons_tarjeta_de_credito : num 0 0 0 0 0 0 1 0 0 1 ...
## $ dh_avg_dia_retiros_d : num 0 16.6 22 0 16.2 ...
## $ dmi_max_egreso_diario : num 0 2061032 2131531 0 1002318 ...
## $ dh_cant_otros_d : num 0 1.41 1.41 0 0 ...
## $ dmi_max_ingreso_diario : num 0 2210300 1858600 0 1000000 ...
## $ dh_val_otros_d : num 0 22651 20200 0 0 ...
## $ pc_ingreso_final : num 3000000 6553755 6670935 3000000 2677500 ...
## $ dh_cant_pagos_d : num 0 8 1 0 4 1 0 10 13 0 ...
## $ dmi_ingreso_total_mes : num 0 4452485 3187600 0 1000000 ...
## $ dh_val_pagos_d : num 0 3085738 1 0 1978774 ...
## $ pc_gasto_familiar : num 765000 2313090 1701088 765000 945000 ...
## $ pc_cuotas_pagadas : num 0 0 0 0 0 ...
## $ pc_ingreso_rutina_con_techo : num 3000000 7710300 6670935 3000000 3150000 ...
## $ dh_cant_salidas : num 0 28 11 0 10 11 0 72 69 0 ...
## $ dh_min_dia_pagos_d : num 0 13 7 0 2 7 0 9 8 0 ...
## $ pc_ingreso_por_rutina : num 3000000 7710300 6670935 3000000 3150000 ...
## $ dmi_egreso_total_mes : num 0 4921015 3055581 0 1998986 ...
## $ dh_val_salidas : num 0 4921015 3055581 0 1998986 ...
## $ cpc_avg_nro_cuota : num 0 0 0 0 0 0 0 64 70 0 ...
## $ cpc_sum_saldo : num 0 0 0 0 0 ...
## $ cpc_saldo_sobre_ing : num 0 0 0 0 0 ...
## $ dh_min_dia_pago_cred_d : num 0 13 7 0 2 7 0 17 15 0 ...
## $ cpc_saldo_tdc : num 0 0 0 0 0 ...
## $ dh_cant_tras_d : num 0 0 0 0 0 0 0 4 2 0 ...
## $ cpc_avg_saldo : num 0 0 0 0 0 ...
## $ dh_val_pago_tarj_d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pcons_hipotecario_vivienda : num 0 0 0 0 0 0 0 0 1 0 ...
## $ flag_mora60_ult12meses : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flag_ultima_entrada_1quincena: num 1 0 0 1 0 0 1 0 0 1 ...
## $ flag_tuvo_mora60_ult3meses : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flag_diaPago_1quincena : num 1 1 1 0 0 0 1 0 0 0 ...
## $ flag_es_cluster_6 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ flag_tuvo_mora90_ult12M : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flax_maxDiaOtrosD_1quincena : num 1 0 0 1 1 0 1 0 0 1 ...
## $ flag_maxDiaPagos_d_1quincena : num 1 0 1 1 0 1 1 0 0 1 ...
## $ flag_encontrado_cifin : num 0 1 1 0 1 1 0 1 1 0 ...
Se utilizará la función nearZeroVar (paquete caret) para eliminar variables con varianza cercana a 0 y la función findCorrelation para eliminar variables independientes cuya correlación absoluta entre sí supere el punto de corte de 0.8.
# Separate the response from the candidate predictors.
Y <- unificado$Y
unificado$Y <- NULL
X <- unificado
# nearZeroVar: column indices of predictors with (near-)zero variance
i <- nearZeroVar(X)
# variables to drop because they carry (almost) no variance
names(X)[i]
## [1] "dh_cant_salidas" "cpc_avg_nro_cuota"
## [3] "pcons_hipotecario_vivienda"
# Guard the removal: with an empty index, X[, -i] selects zero columns and
# would silently discard every predictor. drop = FALSE keeps X a data frame
# even if a single column survives.
if (length(i) > 0) {
  X <- X[, -i, drop = FALSE]
}
# findCorrelation: column indices of predictors to remove so that the
# remaining pairwise correlations stay below the cutoff
i <- findCorrelation(cor(X), cutoff = 0.8)
# variables to drop because they are highly correlated with another predictor
names(X)[i]
## [1] "dh_val_entradas" "dh_val_salidas"
## [3] "dmi_egreso_total_mes" "dmi_ingreso_total_mes"
## [5] "flag_maxDiaPagos_d_1quincena" "flag_ultima_entrada_1quincena"
## [7] "dmi_max_egreso_diario" "pc_gasto_familiar"
## [9] "prom_trim" "pc_ingreso_rutina_con_techo"
## [11] "pc_transaccional" "pc_ingreso_final"
## [13] "prom_bim" "pc_cant_moras_30_ult_12_meses"
## [15] "max_bim" "max_sem"
## [17] "prom_sem" "flag_mora60_ult12meses"
## [19] "prom_mes_anterior" "desv_bim"
## [21] "cpc_sum_saldo"
# Guard the removal: with an empty index, X[, -i] selects zero columns and
# would silently discard every predictor. drop = FALSE keeps X a data frame.
if (length(i) > 0) {
  X <- X[, -i, drop = FALSE]
}
# resulting dataset: response plus the surviving predictors
salida <- cbind(Y, X)
dim(salida)
## [1] 18046 27
# remaining variables
names(salida)
## [1] "Y" "desv_sem"
## [3] "max_mes_anterior" "desv_trim"
## [5] "dh_cant_entradas" "pcons_tarjeta_de_credito"
## [7] "dh_avg_dia_retiros_d" "dh_cant_otros_d"
## [9] "dmi_max_ingreso_diario" "dh_val_otros_d"
## [11] "dh_cant_pagos_d" "dh_val_pagos_d"
## [13] "pc_cuotas_pagadas" "dh_min_dia_pagos_d"
## [15] "pc_ingreso_por_rutina" "cpc_saldo_sobre_ing"
## [17] "dh_min_dia_pago_cred_d" "cpc_saldo_tdc"
## [19] "dh_cant_tras_d" "cpc_avg_saldo"
## [21] "dh_val_pago_tarj_d" "flag_tuvo_mora60_ult3meses"
## [23] "flag_diaPago_1quincena" "flag_es_cluster_6"
## [25] "flag_tuvo_mora90_ult12M" "flax_maxDiaOtrosD_1quincena"
## [27] "flag_encontrado_cifin"
# Fix the RNG seed so that the 80/20 partition is reproducible.
set.seed(1)
# Row indices of the training partition (80% of the rows, without replacement).
S <- sample(
  x = seq_len(nrow(salida)),
  size = round(0.8 * nrow(salida)),
  replace = FALSE
)
train <- salida[S, ]
test <- salida[-S, ]
# Sanity check: both partitions together must account for every row (expect 0).
nrow(salida) - (nrow(train) + nrow(test))
## [1] 0
Se aplicará un escalado mediante puntajes Z (Z = (Xi - X̄) / S, donde X̄ es la media y S la desviación estándar) a través de la función preProcess del paquete caret. Es importante destacar que los parámetros del train dataset (media y desviación estándar) serán usados para el escalado de los datasets test y OOT.
# Apply z-score scaling to the predictors to help the models learn.
# The response is set aside first so that it is not transformed.
Y <- train$Y
train$Y <- NULL
# preProcess() selects its transformations through `method`; it has no
# `scale`/`center` arguments, so the previous `scale = TRUE, center = FALSE`
# were swallowed by `...` and the default method (centre and scale) was what
# actually ran (the column means of 0 in the summary below confirm this).
# The method is now stated explicitly; the fitted transformation is unchanged.
preprocesado <- preProcess(train, method = c("center", "scale"))
train <- predict(preprocesado, train)
train <- cbind(Y, train)
summary(train)
## Y desv_sem max_mes_anterior desv_trim
## Min. :0.0000 Min. :-0.8059 Min. :-0.5075 Min. :-1.19411
## 1st Qu.:0.0000 1st Qu.:-0.6508 1st Qu.:-0.5075 1st Qu.:-1.19411
## Median :1.0000 Median :-0.2743 Median :-0.3711 Median :-0.01921
## Mean :0.6916 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.:1.0000 3rd Qu.: 0.3089 3rd Qu.: 0.1062 3rd Qu.: 0.66893
## Max. :1.0000 Max. :25.0106 Max. :21.1784 Max. : 8.93227
## dh_cant_entradas pcons_tarjeta_de_credito dh_avg_dia_retiros_d
## Min. :-1.0603 Min. :-0.6752 Min. :-1.2211
## 1st Qu.:-1.0603 1st Qu.:-0.6752 1st Qu.:-1.2211
## Median :-0.1007 Median :-0.6752 Median : 0.3910
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7091 3rd Qu.: 1.4809 3rd Qu.: 0.7896
## Max. : 3.9993 Max. : 1.4809 Max. : 3.5566
## dh_cant_otros_d dmi_max_ingreso_diario dh_val_otros_d dh_cant_pagos_d
## Min. :-0.8645 Min. :-0.5939 Min. :-0.4234 Min. :-0.7778
## 1st Qu.:-0.8645 1st Qu.:-0.5939 1st Qu.:-0.4234 1st Qu.:-0.7778
## Median :-0.1101 Median :-0.4010 Median :-0.4165 Median :-0.2944
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6443 3rd Qu.: 0.1653 3rd Qu.:-0.0938 3rd Qu.: 0.5113
## Max. : 5.5814 Max. : 7.2274 Max. : 7.7619 Max. : 6.3123
## dh_val_pagos_d pc_cuotas_pagadas dh_min_dia_pagos_d pc_ingreso_por_rutina
## Min. :-0.5468 Min. :-0.6194 Min. :-0.6552 Min. :-0.8032
## 1st Qu.:-0.5468 1st Qu.:-0.6194 1st Qu.:-0.6552 1st Qu.:-0.6827
## Median :-0.4491 Median :-0.3631 Median :-0.3244 Median :-0.3695
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.1372 3rd Qu.: 0.1808 3rd Qu.: 0.1718 3rd Qu.: 0.2769
## Max. : 8.6602 Max. : 6.5910 Max. : 4.4722 Max. : 4.4212
## cpc_saldo_sobre_ing dh_min_dia_pago_cred_d cpc_saldo_tdc
## Min. :-0.4104 Min. :-0.6821 Min. :-0.7940
## 1st Qu.:-0.3699 1st Qu.:-0.6821 1st Qu.:-0.6288
## Median :-0.3519 Median :-0.5430 Median :-0.5864
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.1032 3rd Qu.: 0.2916 3rd Qu.: 0.2453
## Max. :10.5052 Max. : 3.6301 Max. : 5.6517
## dh_cant_tras_d cpc_avg_saldo dh_val_pago_tarj_d
## Min. :-0.47777 Min. :-0.5237 Min. :-0.3933
## 1st Qu.:-0.47777 1st Qu.:-0.4361 1st Qu.:-0.3933
## Median :-0.47777 Median :-0.3738 Median :-0.3933
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.06338 3rd Qu.:-0.0262 3rd Qu.:-0.1123
## Max. : 5.94527 Max. : 8.8241 Max. : 9.4425
## flag_tuvo_mora60_ult3meses flag_diaPago_1quincena flag_es_cluster_6
## Min. :-0.3841 Min. :-1.5585 Min. :-1.0087
## 1st Qu.:-0.3841 1st Qu.:-1.5585 1st Qu.:-1.0087
## Median :-0.3841 Median : 0.6416 Median : 0.9913
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.3841 3rd Qu.: 0.6416 3rd Qu.: 0.9913
## Max. : 2.6035 Max. : 0.6416 Max. : 0.9913
## flag_tuvo_mora90_ult12M flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
## Min. :-0.2625 Min. :-1.1344 Min. :-1.4521
## 1st Qu.:-0.2625 1st Qu.:-1.1344 1st Qu.:-1.4521
## Median :-0.2625 Median : 0.8815 Median : 0.6886
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2625 3rd Qu.: 0.8815 3rd Qu.: 0.6886
## Max. : 3.8087 Max. : 0.8815 Max. : 0.6886
# Scale the test set with the parameters (means and standard deviations)
# learned on the training set, stored in `preprocesado`.
Y <- test[["Y"]]
test <- test[, setdiff(names(test), "Y"), drop = FALSE]
test <- cbind(Y, predict(preprocesado, test))
summary(test)
## Y desv_sem max_mes_anterior desv_trim
## Min. :0.0000 Min. :-0.805923 Min. :-0.50753 Min. :-1.19411
## 1st Qu.:0.0000 1st Qu.:-0.642950 1st Qu.:-0.50753 1st Qu.:-1.19411
## Median :1.0000 Median :-0.287648 Median :-0.50753 Median :-0.04873
## Mean :0.6847 Mean :-0.003183 Mean : 0.00545 Mean :-0.01659
## 3rd Qu.:1.0000 3rd Qu.: 0.285270 3rd Qu.: 0.10622 3rd Qu.: 0.66176
## Max. :1.0000 Max. :16.091238 Max. :38.70451 Max. : 7.25586
## dh_cant_entradas pcons_tarjeta_de_credito dh_avg_dia_retiros_d
## Min. :-1.060281 Min. :-0.675202 Min. :-1.221137
## 1st Qu.:-1.060281 1st Qu.:-0.675202 1st Qu.:-1.221137
## Median :-0.100676 Median :-0.675202 Median : 0.403731
## Mean : 0.002092 Mean :-0.005479 Mean : 0.006123
## 3rd Qu.: 0.760440 3rd Qu.: 1.480935 3rd Qu.: 0.789637
## Max. : 3.756894 Max. : 1.480935 Max. : 4.059682
## dh_cant_otros_d dmi_max_ingreso_diario dh_val_otros_d
## Min. :-0.864524 Min. :-0.593912 Min. :-0.4234
## 1st Qu.:-0.864524 1st Qu.:-0.593912 1st Qu.:-0.4234
## Median :-0.110089 Median :-0.402591 Median :-0.4194
## Mean : 0.003838 Mean : 0.008742 Mean : 0.0152
## 3rd Qu.: 0.644345 3rd Qu.: 0.173518 3rd Qu.:-0.1013
## Max. : 5.625365 Max. : 7.227389 Max. : 7.8260
## dh_cant_pagos_d dh_val_pagos_d pc_cuotas_pagadas dh_min_dia_pagos_d
## Min. :-0.77780 Min. :-0.54682 Min. :-0.61941 Min. :-0.655150
## 1st Qu.:-0.77780 1st Qu.:-0.54682 1st Qu.:-0.61941 1st Qu.:-0.655150
## Median :-0.29438 Median :-0.43672 Median :-0.35645 Median :-0.324356
## Mean : 0.02972 Mean : 0.02651 Mean : 0.01819 Mean :-0.007585
## 3rd Qu.: 0.51131 3rd Qu.: 0.14503 3rd Qu.: 0.16807 3rd Qu.: 0.171836
## Max. : 7.11798 Max. : 8.64759 Max. : 6.59100 Max. : 4.472163
## pc_ingreso_por_rutina cpc_saldo_sobre_ing dh_min_dia_pago_cred_d
## Min. :-0.80319 Min. :-0.40385 Min. :-0.68214
## 1st Qu.:-0.68304 1st Qu.:-0.36990 1st Qu.:-0.68214
## Median :-0.37979 Median :-0.34969 Median :-0.54303
## Mean :-0.01846 Mean : 0.03398 Mean :-0.01309
## 3rd Qu.: 0.27872 3rd Qu.:-0.09272 3rd Qu.: 0.29160
## Max. : 4.41872 Max. :10.50524 Max. : 3.63014
## cpc_saldo_tdc dh_cant_tras_d cpc_avg_saldo dh_val_pago_tarj_d
## Min. :-0.77679 Min. :-0.47777 Min. :-0.52372 Min. :-0.39327
## 1st Qu.:-0.62885 1st Qu.:-0.47777 1st Qu.:-0.43609 1st Qu.:-0.39327
## Median :-0.56706 Median :-0.47777 Median :-0.35621 Median :-0.39327
## Mean : 0.01752 Mean :-0.01044 Mean : 0.05172 Mean : 0.02682
## 3rd Qu.: 0.26016 3rd Qu.:-0.06338 3rd Qu.:-0.01338 3rd Qu.:-0.08126
## Max. : 5.44829 Max. : 5.94527 Max. : 8.49138 Max. : 9.44247
## flag_tuvo_mora60_ult3meses flag_diaPago_1quincena flag_es_cluster_6
## Min. :-0.38408 Min. :-1.55854 Min. :-1.008661
## 1st Qu.:-0.38408 1st Qu.:-1.55854 1st Qu.:-1.008661
## Median :-0.38408 Median : 0.64158 Median : 0.991344
## Mean : 0.01079 Mean :-0.03936 Mean : 0.002148
## 3rd Qu.:-0.38408 3rd Qu.: 0.64158 3rd Qu.: 0.991344
## Max. : 2.60348 Max. : 0.64158 Max. : 0.991344
## flag_tuvo_mora90_ult12M flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
## Min. :-0.2625 Min. :-1.13436 Min. :-1.45210
## 1st Qu.:-0.2625 1st Qu.:-1.13436 1st Qu.:-1.45210
## Median :-0.2625 Median : 0.88149 Median : 0.68861
## Mean : 0.0003 Mean :-0.00048 Mean : 0.01004
## 3rd Qu.:-0.2625 3rd Qu.: 0.88149 3rd Qu.: 0.68861
## Max. : 3.8087 Max. : 0.88149 Max. : 0.68861
Se utilizará el método SMOTE para balancear las clases de la variable respuesta en el train dataset (el test dataset no se modifica).
# Balance the training set with SMOTE. The response is converted to a factor
# on a copy of the training data before the call.
temp <- train
temp$Y <- as.factor(temp$Y)
class(temp$Y)
## [1] "factor"
# NOTE: `S` is reused here; it previously held the row indices of the
# train/test split.
S <- SMOTE(Y ~ ., data = temp, perc.over = 100)
table(S$Y)
##
## 0 1
## 8904 8904
# Recover the numeric 0/1 response from the factor *labels*. Converting the
# internal integer codes (as.numeric(f) - 1) only works while the levels
# happen to be exactly "0" and "1" in that order.
S$Y <- as.numeric(as.character(S$Y))
table(S$Y)
##
## 0 1
## 8904 8904
# balanced training set used by the models below
train2 <- S
# Logistic regression fitted on the SMOTE-balanced training set (`train2`).
# Terms that are commented out were excluded because of VIF > 4 or a very
# high p-value.
modelo_logistica <- glm(
Y ~
desv_sem
+ max_mes_anterior
+ desv_trim
+ dh_cant_entradas
+ pcons_tarjeta_de_credito
+ dh_avg_dia_retiros_d
#+ dh_cant_otros_d
#+ dmi_max_ingreso_diario
#+ dh_val_otros_d
+ dh_cant_pagos_d
#+ dh_val_pagos_d
+ pc_cuotas_pagadas
+ dh_min_dia_pagos_d
+ pc_ingreso_por_rutina
+ cpc_saldo_sobre_ing
+ dh_min_dia_pago_cred_d
+ cpc_saldo_tdc
#+ dh_cant_tras_d
+ cpc_avg_saldo
+ dh_val_pago_tarj_d
+ flag_tuvo_mora60_ult3meses
#+ flag_diaPago_1quincena
+ flag_es_cluster_6
+ flag_tuvo_mora90_ult12M
#+ flax_maxDiaOtrosD_1quincena
+ flag_encontrado_cifin
, data = train2
, family = "binomial"
)
# Prueba VIF
vif(modelo_logistica)
## desv_sem max_mes_anterior
## 2.498924 1.438054
## desv_trim dh_cant_entradas
## 2.428364 3.044172
## pcons_tarjeta_de_credito dh_avg_dia_retiros_d
## 1.093417 3.082506
## dh_cant_pagos_d pc_cuotas_pagadas
## 2.620900 1.221223
## dh_min_dia_pagos_d pc_ingreso_por_rutina
## 2.120878 1.195525
## cpc_saldo_sobre_ing dh_min_dia_pago_cred_d
## 2.036106 2.013672
## cpc_saldo_tdc cpc_avg_saldo
## 1.507262 2.061050
## dh_val_pago_tarj_d flag_tuvo_mora60_ult3meses
## 1.401443 1.691738
## flag_es_cluster_6 flag_tuvo_mora90_ult12M
## 1.202198 1.802167
## flag_encontrado_cifin
## 1.551095
# P-values
summary(modelo_logistica)
##
## Call:
## glm(formula = Y ~ desv_sem + max_mes_anterior + desv_trim + dh_cant_entradas +
## pcons_tarjeta_de_credito + dh_avg_dia_retiros_d + dh_cant_pagos_d +
## pc_cuotas_pagadas + dh_min_dia_pagos_d + pc_ingreso_por_rutina +
## cpc_saldo_sobre_ing + dh_min_dia_pago_cred_d + cpc_saldo_tdc +
## cpc_avg_saldo + dh_val_pago_tarj_d + flag_tuvo_mora60_ult3meses +
## flag_es_cluster_6 + flag_tuvo_mora90_ult12M + flag_encontrado_cifin,
## family = "binomial", data = train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0183 -1.1163 0.2137 1.0492 3.5088
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.054021 0.016054 3.365 0.000765 ***
## desv_sem -0.416694 0.032800 -12.704 < 2e-16 ***
## max_mes_anterior -0.198661 0.026386 -7.529 5.11e-14 ***
## desv_trim -0.054705 0.026350 -2.076 0.037887 *
## dh_cant_entradas 0.337139 0.028536 11.815 < 2e-16 ***
## pcons_tarjeta_de_credito -0.108810 0.016486 -6.600 4.11e-11 ***
## dh_avg_dia_retiros_d -0.101548 0.028240 -3.596 0.000323 ***
## dh_cant_pagos_d -0.068394 0.025683 -2.663 0.007745 **
## pc_cuotas_pagadas 0.016334 0.017699 0.923 0.356092
## dh_min_dia_pagos_d -0.013568 0.023460 -0.578 0.563019
## pc_ingreso_por_rutina 0.007065 0.017877 0.395 0.692695
## cpc_saldo_sobre_ing -0.024267 0.023083 -1.051 0.293124
## dh_min_dia_pago_cred_d 0.075297 0.023111 3.258 0.001122 **
## cpc_saldo_tdc -0.177958 0.019884 -8.950 < 2e-16 ***
## cpc_avg_saldo 0.032348 0.023794 1.360 0.173984
## dh_val_pago_tarj_d 0.069909 0.019983 3.498 0.000468 ***
## flag_tuvo_mora60_ult3meses -0.169527 0.020893 -8.114 4.90e-16 ***
## flag_es_cluster_6 0.084413 0.017430 4.843 1.28e-06 ***
## flag_tuvo_mora90_ult12M -0.138459 0.021116 -6.557 5.49e-11 ***
## flag_encontrado_cifin -0.317650 0.020251 -15.685 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24687 on 17807 degrees of freedom
## Residual deviance: 22554 on 17788 degrees of freedom
## AIC: 22594
##
## Number of Fisher Scoring iterations: 4
# Evaluate the performance of a fitted logistic regression at a given cutoff.
#
# Arguments:
#   modelo  fitted model whose predict(..., type = "response") returns the
#           probability of class 1 (e.g. a binomial glm).
#   dataset data frame with the model's predictors and a 0/1 response column
#           named `Y`.
#   corte   probability cutoff; observations with probability >= corte are
#           classified as 1.
#
# Returns a one-row data frame with columns corte, acc, auc, gini, TNR, TPR
# (all metrics rounded to two decimals).
rendimiento <- function(modelo, dataset, corte)
{
  # Use the arguments that were passed in; the previous version ignored them
  # and always read the globals `modelo_logistica` and `test`.
  real <- dataset$Y
  prob <- as.numeric(predict(modelo, dataset, type = "response"))
  pred <- ifelse(prob >= corte, 1, 0)

  # Fix both dimensions to the levels 0/1 so the confusion matrix is always
  # 2 x 2. Without this, a cutoff at which every prediction falls in a single
  # class yields a 2 x 1 table and diag() no longer counts the correct hits.
  tabla <- table(
    factor(real, levels = c(0, 1)),
    factor(pred, levels = c(0, 1))
  )
  acc <- round(sum(diag(tabla)) / sum(tabla), 2)

  # The ROC curve is built on the thresholded labels, not on the
  # probabilities, so this "auc" varies with the cutoff (it equals the mean
  # of TPR and TNR). Levels and direction are stated explicitly instead of
  # being auto-detected on every call.
  auc <- round(
    as.numeric(roc(real, pred, levels = c(0, 1), direction = "<")$auc),
    2
  )
  gini <- (2 * auc) - 1

  # Rows of `tabla` are the real classes, columns the predicted classes.
  TNR <- round(tabla[1, 1] / sum(tabla[1, ]), 2)
  TPR <- round(tabla[2, 2] / sum(tabla[2, ]), 2)

  data.frame(
    corte = corte, acc = acc, auc = auc, gini = gini, TNR = TNR, TPR = TPR
  )
}
# Baseline: evaluate the logistic regression on the test set at the 0.5 cutoff
rendimiento(modelo_logistica, test, 0.5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.5 0.65 0.63 0.26 0.58 0.68
# Search for the best cutoff: sweep from 0.20 to 0.80 in steps of 0.01 and
# print the test-set metrics of the logistic regression at each value.
# NOTE(review): the cutoff is being chosen on the same test set used to report
# performance, which tends to make the reported figures optimistic -- consider
# a separate validation split.
for(i in seq(0.2, 0.8, by = 0.01))
{
print(rendimiento(modelo_logistica, test, i))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.2 0.7 0.54 0.08 0.1 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.21 0.7 0.54 0.08 0.11 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.22 0.7 0.54 0.08 0.11 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.23 0.7 0.54 0.08 0.12 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.24 0.7 0.55 0.1 0.13 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.25 0.7 0.55 0.1 0.13 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.26 0.7 0.55 0.1 0.14 0.95
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.27 0.7 0.55 0.1 0.15 0.95
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.28 0.7 0.56 0.12 0.17 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.29 0.7 0.56 0.12 0.18 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.3 0.7 0.57 0.14 0.2 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.31 0.7 0.57 0.14 0.21 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.32 0.7 0.58 0.16 0.23 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.33 0.71 0.58 0.16 0.24 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.34 0.71 0.59 0.18 0.27 0.91
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.35 0.71 0.6 0.2 0.29 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.36 0.71 0.6 0.2 0.3 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.37 0.71 0.6 0.2 0.32 0.88
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.38 0.7 0.61 0.22 0.34 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.39 0.7 0.61 0.22 0.35 0.86
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.4 0.7 0.61 0.22 0.36 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.41 0.7 0.61 0.22 0.38 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.42 0.69 0.61 0.22 0.4 0.82
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.43 0.68 0.61 0.22 0.42 0.81
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.44 0.68 0.62 0.24 0.45 0.79
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.45 0.68 0.62 0.24 0.47 0.77
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.46 0.67 0.63 0.26 0.5 0.75
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.47 0.67 0.62 0.24 0.51 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.48 0.66 0.62 0.24 0.54 0.71
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.49 0.65 0.62 0.24 0.55 0.69
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.5 0.65 0.63 0.26 0.58 0.68
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.51 0.64 0.63 0.26 0.6 0.66
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.52 0.63 0.63 0.26 0.62 0.63
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.53 0.62 0.63 0.26 0.65 0.6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.54 0.61 0.63 0.26 0.68 0.58
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.55 0.6 0.62 0.24 0.69 0.56
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.56 0.59 0.62 0.24 0.71 0.53
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.57 0.57 0.61 0.22 0.73 0.5
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.58 0.56 0.61 0.22 0.76 0.47
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.59 0.54 0.61 0.22 0.77 0.44
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.6 0.53 0.6 0.2 0.79 0.41
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.61 0.52 0.59 0.18 0.8 0.38
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.62 0.5 0.59 0.18 0.83 0.36
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.63 0.49 0.58 0.16 0.84 0.33
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.64 0.48 0.58 0.16 0.86 0.31
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.65 0.47 0.58 0.16 0.87 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.66 0.46 0.58 0.16 0.89 0.26
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.67 0.44 0.57 0.14 0.9 0.23
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.68 0.43 0.56 0.12 0.92 0.21
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.69 0.42 0.56 0.12 0.93 0.18
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.7 0.4 0.55 0.1 0.94 0.16
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.71 0.39 0.54 0.08 0.95 0.13
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.72 0.38 0.53 0.06 0.96 0.11
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.73 0.36 0.53 0.06 0.97 0.08
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.74 0.35 0.52 0.04 0.97 0.07
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.75 0.35 0.52 0.04 0.98 0.06
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.76 0.33 0.51 0.02 0.99 0.03
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.77 0.33 0.51 0.02 0.99 0.03
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.78 0.32 0.5 0 0.99 0.02
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.79 0.32 0.5 0 0.99 0.01
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.8 0.32 0.5 0 0.99 0.01
# Final summary row for the logistic regression at the 0.5 cutoff, labelled
# with the model name in a leading `Modelo` column.
resultado1 <- data.frame(
  Modelo = "RegLog",
  rendimiento(modelo_logistica, test, 0.5)
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(resultado1)
## Modelo corte acc auc gini TNR TPR
## 1 RegLog 0.5 0.65 0.63 0.26 0.58 0.68
Se entrenó un random forest tanto con datos sin balancear como con datos balanceados a través del método SMOTE (paquete DMwR); finalmente, el 2° random forest presentó el mejor AUC con un punto de corte de 0.5.
# The for loops further down are the scaffold used to search for the best
# parameters.
#
# Evaluate the performance of a fitted classifier at a given cutoff.
# This definition replaces the earlier glm-oriented `rendimiento`: it expects
# predict(..., type = "prob") to return a matrix whose second column is the
# probability of class 1 (as randomForest classifiers do).
#
# Arguments:
#   modelo  fitted classifier supporting predict(modelo, dataset, type = "prob").
#   dataset data frame with the model's predictors and a 0/1 response column
#           named `Y`.
#   corte   probability cutoff; observations with probability >= corte are
#           classified as 1.
#
# Returns a one-row data frame with columns corte, acc, auc, gini, TNR, TPR
# (all metrics rounded to two decimals).
rendimiento <- function(modelo, dataset, corte)
{
  # Take the true labels from the dataset being scored; the previous version
  # always read them from the global `test`, whatever `dataset` was.
  real <- dataset$Y
  prob <- as.numeric(predict(modelo, dataset, type = "prob")[, 2])
  pred <- ifelse(prob >= corte, 1, 0)

  # Fix both dimensions to the levels 0/1 so the confusion matrix is always
  # 2 x 2. Without this, a cutoff at which every prediction falls in a single
  # class yields a 2 x 1 table and diag() no longer counts the correct hits.
  tabla <- table(
    factor(real, levels = c(0, 1)),
    factor(pred, levels = c(0, 1))
  )
  acc <- round(sum(diag(tabla)) / sum(tabla), 2)

  # The ROC curve is built on the thresholded labels, not on the
  # probabilities, so this "auc" varies with the cutoff (it equals the mean
  # of TPR and TNR). Levels and direction are stated explicitly instead of
  # being auto-detected on every call.
  auc <- round(
    as.numeric(roc(real, pred, levels = c(0, 1), direction = "<")$auc),
    2
  )
  gini <- (2 * auc) - 1

  # Rows of `tabla` are the real classes, columns the predicted classes.
  TNR <- round(tabla[1, 1] / sum(tabla[1, ]), 2)
  TPR <- round(tabla[2, 2] / sum(tabla[2, ]), 2)

  data.frame(
    corte = corte, acc = acc, auc = auc, gini = gini, TNR = TNR, TPR = TPR
  )
}
# Random forest trained on the unbalanced training set (`train`).
# The loop currently runs a single iteration; `parametro` is not passed to the
# model and only labels the printed row.
for(parametro in 1:1)
{
# Model fitting (seed fixed so the forest is reproducible)
set.seed(1)
RF1 <- randomForest(
as.factor(Y) ~ .
, data = train
, ntree = 250
, mtry = 7
# class priors -- presumably in factor-level order (0, 1), i.e. extra weight
# on class 0; confirm against the randomForest documentation
, classwt = c(1.5, 1)
)
# Evaluate performance on the test set at the 0.5 cutoff
print(cbind(parametro, rendimiento(RF1, test, 0.5)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## parametro corte acc auc gini TNR TPR
## 1 1 0.5 0.75 0.67 0.34 0.44 0.89
# Random forest trained on the SMOTE-balanced training set (`train2`).
# The loop currently runs a single iteration; `parametro` is not passed to the
# model and only labels the printed row.
for(parametro in 1:1)
{
# Model fitting (seed fixed so the forest is reproducible)
set.seed(1)
RF2 <- randomForest(
as.factor(Y) ~ .
, data = train2
, ntree = 100
, mtry = 7
, replace = TRUE
)
# Evaluate performance on the test set at the 0.5 cutoff
print(cbind(parametro, rendimiento(RF2, test, 0.5)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## parametro corte acc auc gini TNR TPR
## 1 1 0.5 0.71 0.71 0.42 0.7 0.71
# Evaluate every cutoff from 0.20 to 0.80 in steps of 0.01 and print the
# test-set metrics of RF2 at each value.
# NOTE(review): the cutoff is being chosen on the same test set used to report
# performance, which tends to make the reported figures optimistic -- consider
# a separate validation split.
for(i in seq(0.2, 0.8, by = 0.01))
{
print(rendimiento(RF2, test, i))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.2 0.74 0.6 0.2 0.23 0.97
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.21 0.74 0.61 0.22 0.26 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.22 0.74 0.61 0.22 0.26 0.96
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.23 0.74 0.62 0.24 0.28 0.95
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.24 0.74 0.62 0.24 0.3 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.25 0.74 0.62 0.24 0.3 0.94
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.26 0.74 0.63 0.26 0.32 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.27 0.75 0.64 0.28 0.34 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.28 0.75 0.64 0.28 0.36 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.29 0.75 0.65 0.3 0.4 0.91
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.3 0.75 0.66 0.32 0.41 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.31 0.75 0.66 0.32 0.41 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.32 0.74 0.66 0.32 0.42 0.89
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.33 0.75 0.66 0.32 0.44 0.89
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.34 0.75 0.67 0.34 0.46 0.88
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.35 0.74 0.67 0.34 0.47 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.36 0.74 0.68 0.36 0.49 0.86
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.37 0.74 0.68 0.36 0.5 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.38 0.74 0.68 0.36 0.52 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.39 0.74 0.69 0.38 0.54 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.4 0.74 0.69 0.38 0.56 0.82
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.41 0.73 0.69 0.38 0.59 0.8
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.42 0.73 0.69 0.38 0.6 0.79
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.43 0.73 0.7 0.4 0.61 0.78
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.44 0.73 0.7 0.4 0.61 0.78
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.45 0.72 0.7 0.4 0.63 0.77
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.46 0.72 0.7 0.4 0.64 0.76
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.47 0.72 0.7 0.4 0.67 0.74
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.48 0.71 0.7 0.4 0.68 0.72
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.49 0.71 0.7 0.4 0.68 0.72
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.5 0.71 0.71 0.42 0.7 0.71
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.51 0.7 0.7 0.4 0.71 0.7
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.52 0.7 0.7 0.4 0.72 0.69
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.53 0.69 0.7 0.4 0.73 0.67
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.54 0.68 0.7 0.4 0.74 0.65
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.55 0.68 0.7 0.4 0.76 0.64
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.56 0.67 0.7 0.4 0.77 0.62
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.57 0.66 0.7 0.4 0.79 0.6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.58 0.65 0.69 0.38 0.8 0.59
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.59 0.65 0.69 0.38 0.82 0.57
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.6 0.64 0.69 0.38 0.82 0.55
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.61 0.63 0.68 0.36 0.83 0.53
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.62 0.63 0.68 0.36 0.83 0.53
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.63 0.62 0.68 0.36 0.85 0.52
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.64 0.61 0.68 0.36 0.86 0.5
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.65 0.61 0.68 0.36 0.87 0.48
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.66 0.6 0.67 0.34 0.88 0.47
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.67 0.59 0.67 0.34 0.88 0.45
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.68 0.58 0.66 0.32 0.89 0.44
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.69 0.57 0.66 0.32 0.89 0.42
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.7 0.56 0.65 0.3 0.9 0.4
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.71 0.55 0.64 0.28 0.9 0.38
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.72 0.54 0.64 0.28 0.91 0.37
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.73 0.53 0.63 0.26 0.91 0.35
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.74 0.52 0.62 0.24 0.92 0.33
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.75 0.51 0.62 0.24 0.92 0.32
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.76 0.5 0.62 0.24 0.93 0.3
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.77 0.49 0.61 0.22 0.94 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.78 0.49 0.61 0.22 0.94 0.28
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.79 0.48 0.61 0.22 0.95 0.27
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.8 0.47 0.6 0.2 0.95 0.25
# Finally: store the Random Forest (RF2) metrics on `test` at cutoff 0.5,
# prefixed with a column holding the model label.
resultado2 <- as.data.frame(cbind("RandomForest", rendimiento(RF2, test, 0.5)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# The first column holds the model label; name it "Modelo".
names(resultado2)[1] <- "Modelo"
print(resultado2)
## Modelo corte acc auc gini TNR TPR
## 1 RandomForest 0.5 0.71 0.71 0.42 0.7 0.71
# In addition, the predicted probabilities (second column of the "prob"
# matrix) are distributed as follows:
hist(as.numeric(predict(RF2, test, type = "prob")[,2]))
# Prepare the data for the model: work on a copy of train2 with the target Y
# converted to a factor, and fix the RNG seed before training.
usar <- train2
usar$Y <- as.factor(usar$Y)
set.seed(1)
# Train the model with all candidate variables and default settings.
# NOTE(review): the inline "Importance" annotations do not match the
# importances printed below (e.g. desv_sem 33.72 here vs 29.35 in the output);
# presumably they were copied from an earlier run -- confirm before relying
# on them.
modeloAda <- boosting(
Y ~
desv_sem # Importance: 33.7176260590549
+ max_mes_anterior # Importance: 18.8506694661822
+ dh_cant_tras_d # Importance: 6.66562441816714
+ dh_min_dia_pagos_d # Importance: 5.74571007099648
+ flag_tuvo_mora60_ult3meses # Importance: 5.27516596907519
+ pc_cuotas_pagadas # Importance: 4.94772841710666
+ dh_cant_entradas # Importance: 3.6451690230774
+ flag_encontrado_cifin # Importance: 3.51978408061676
+ pc_ingreso_por_rutina # Importance: 3.20238547917842
+ desv_trim # Importance: 3.06059257514681
+ cpc_saldo_tdc # Importance: 2.89533364487582
+ dh_cant_otros_d # Importance: 2.49716825059181
+ dh_val_pago_tarj_d # Importance: 1.03252217294607
+ dh_cant_pagos_d # Importance: 0.899084100587163
+ cpc_avg_saldo # Importance: 0.860227126123119
+ dh_min_dia_pago_cred_d # Importance: 0.790252390234587
+ pcons_tarjeta_de_credito # Importance: 0.623763602744435
+ dh_val_otros_d # Importance: 0.525760920959076
+ dmi_max_ingreso_diario # Importance: 0.316415754836402
+ cpc_saldo_sobre_ing # Importance: 0.313236673318644
+ dh_val_pagos_d # Importance: 0.266451837486692
+ dh_avg_dia_retiros_d # Importance: 0.168203660652142
+ flag_diaPago_1quincena # Importance: 0.104916028226669
+ flag_es_cluster_6 # Importance: 0.0762082778154691
+ flag_tuvo_mora90_ult12M # Importance: 0
+ flax_maxDiaOtrosD_1quincena # Importance: 0
, data = usar
)
# Print the importance of each variable in the fitted model.
modeloAda$importance # variable importance
## cpc_avg_saldo cpc_saldo_sobre_ing
## 0.62945995 0.62137479
## cpc_saldo_tdc desv_sem
## 2.02408780 29.34513646
## desv_trim dh_avg_dia_retiros_d
## 5.06654437 0.38408579
## dh_cant_entradas dh_cant_otros_d
## 3.96578792 2.53680494
## dh_cant_pagos_d dh_cant_tras_d
## 1.01793538 11.50905232
## dh_min_dia_pago_cred_d dh_min_dia_pagos_d
## 1.07949399 5.87323983
## dh_val_otros_d dh_val_pago_tarj_d
## 0.48679549 1.63684079
## dh_val_pagos_d dmi_max_ingreso_diario
## 0.45164458 0.56812405
## flag_diaPago_1quincena flag_encontrado_cifin
## 0.04640731 2.98015468
## flag_es_cluster_6 flag_tuvo_mora60_ult3meses
## 0.23381769 3.30989601
## flag_tuvo_mora90_ult12M flax_maxDiaOtrosD_1quincena
## 0.23937819 0.00000000
## max_mes_anterior pc_cuotas_pagadas
## 18.36357557 4.13535714
## pc_ingreso_por_rutina pcons_tarjeta_de_credito
## 2.78858873 0.70641620
# Evaluate a fitted classifier on a labelled dataset at a probability cutoff.
#
# Args:
#   modelo:  fitted model for which `predict(modelo, dataset)` returns a list
#            with a `prob` matrix; its second column is used as the score for
#            class 1.
#   dataset: data frame holding the predictors and the observed 0/1 target `Y`.
#   corte:   cutoff; observations with score >= corte are classified as 1.
#
# Returns:
#   A one-row data frame with columns corte, acc, auc, gini, TNR and TPR
#   (metrics rounded to 2 decimals; gini = 2 * auc - 1 using the rounded auc).
rendimiento <- function(modelo, dataset, corte)
{
  # Observed classes come from the dataset being scored, not from the global
  # `test` object, so the function also works for any other labelled dataset.
  reales <- dataset$Y
  prob_clase1 <- as.numeric(predict(modelo, dataset)$prob[, 2])
  pred <- ifelse(prob_clase1 >= corte, 1, 0)

  # Share of matching labels. Unlike sum(diag(table(...))), this stays correct
  # when every prediction falls in a single class (non-square table).
  acc <- round(mean(reales == pred, na.rm = TRUE), 2)

  # NOTE(review): the AUC is computed on the thresholded predictions, not on
  # the probabilities, so it changes with `corte`. Kept as-is so the figures
  # already reported stay reproducible; consider roc(reales, prob_clase1).
  auc <- round(as.numeric(roc(reales, pred)$auc), 2)
  gini <- (2 * auc) - 1

  es_negativo <- reales == 0
  es_positivo <- reales == 1
  TNR <- round(
    sum(es_negativo & pred == 0, na.rm = TRUE) / sum(es_negativo, na.rm = TRUE),
    2
  )
  TPR <- round(
    sum(es_positivo & pred == 1, na.rm = TRUE) / sum(es_positivo, na.rm = TRUE),
    2
  )

  data.frame(
    corte = corte, acc = acc, auc = auc, gini = gini, TNR = TNR, TPR = TPR
  )
}
# Model omitting some variables: the terms commented out below are excluded
# from the formula. The fit uses mfinal = 10 and boos = TRUE (see the adabag
# `boosting` documentation for their exact meaning).
# NOTE(review): the inline "Importance" annotations do not match the
# importances printed for the full model earlier in this file; presumably
# they come from an earlier run -- confirm.
modeloAda <- boosting(
Y ~
desv_sem # Importance: 33.7176260590549
+ max_mes_anterior # Importance: 18.8506694661822
+ dh_cant_tras_d # Importance: 6.66562441816714
+ dh_min_dia_pagos_d # Importance: 5.74571007099648
+ flag_tuvo_mora60_ult3meses # Importance: 5.27516596907519
+ pc_cuotas_pagadas # Importance: 4.94772841710666
+ dh_cant_entradas # Importance: 3.6451690230774
+ flag_encontrado_cifin # Importance: 3.51978408061676
+ pc_ingreso_por_rutina # Importance: 3.20238547917842
+ desv_trim # Importance: 3.06059257514681
+ cpc_saldo_tdc # Importance: 2.89533364487582
+ dh_cant_otros_d # Importance: 2.49716825059181
+ dh_val_pago_tarj_d # Importance: 1.03252217294607
#+ dh_cant_pagos_d # Importance: 0.899084100587163
#+ cpc_avg_saldo # Importance: 0.860227126123119
#+ dh_min_dia_pago_cred_d # Importance: 0.790252390234587
#+ pcons_tarjeta_de_credito # Importance: 0.623763602744435
#+ dh_val_otros_d # Importance: 0.525760920959076
#+ dmi_max_ingreso_diario # Importance: 0.316415754836402
#+ cpc_saldo_sobre_ing # Importance: 0.313236673318644
#+ dh_val_pagos_d # Importance: 0.266451837486692
#+ dh_avg_dia_retiros_d # Importance: 0.168203660652142
#+ flag_diaPago_1quincena # Importance: 0.104916028226669
#+ flag_es_cluster_6 # Importance: 0.0762082778154691
#+ flag_tuvo_mora90_ult12M # Importance: 0
#+ flax_maxDiaOtrosD_1quincena # Importance: 0
, data = usar
, mfinal = 10
, boos = TRUE
)
# Model performance on `test` at the default 0.5 cutoff.
rendimiento(modeloAda, test, 0.5)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.5 0.66 0.64 0.28 0.59 0.69
# Look for the best cutoff: print the AdaBoost metrics on `test` for every
# candidate cutoff between 0.20 and 0.80 in steps of 0.01.
for (i in seq(from = 0.2, to = 0.8, by = 0.01)) {
  print(rendimiento(modelo = modeloAda, dataset = test, corte = i))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.2 0.71 0.58 0.16 0.22 0.93
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.21 0.71 0.58 0.16 0.25 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.22 0.71 0.58 0.16 0.25 0.92
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.23 0.7 0.59 0.18 0.26 0.91
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.24 0.7 0.59 0.18 0.27 0.9
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.25 0.7 0.59 0.18 0.3 0.89
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.26 0.7 0.6 0.2 0.31 0.88
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.27 0.7 0.6 0.2 0.33 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.28 0.7 0.6 0.2 0.34 0.87
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.29 0.7 0.6 0.2 0.35 0.86
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.3 0.7 0.61 0.22 0.36 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.31 0.7 0.61 0.22 0.37 0.85
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.32 0.7 0.61 0.22 0.38 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.33 0.69 0.61 0.22 0.39 0.84
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.34 0.69 0.62 0.24 0.41 0.83
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.35 0.69 0.62 0.24 0.41 0.82
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.36 0.69 0.62 0.24 0.43 0.81
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.37 0.69 0.62 0.24 0.45 0.8
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.38 0.69 0.62 0.24 0.45 0.8
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.39 0.69 0.63 0.26 0.46 0.79
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.4 0.68 0.63 0.26 0.48 0.77
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.41 0.67 0.63 0.26 0.5 0.76
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.42 0.68 0.63 0.26 0.51 0.75
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.43 0.68 0.63 0.26 0.52 0.75
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.44 0.67 0.63 0.26 0.53 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.45 0.67 0.63 0.26 0.54 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.46 0.67 0.63 0.26 0.54 0.73
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.47 0.67 0.64 0.28 0.56 0.72
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.48 0.66 0.64 0.28 0.56 0.71
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.49 0.66 0.64 0.28 0.57 0.7
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.5 0.66 0.64 0.28 0.59 0.69
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.51 0.65 0.64 0.28 0.6 0.68
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.52 0.65 0.64 0.28 0.6 0.67
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.53 0.65 0.64 0.28 0.61 0.67
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.54 0.64 0.64 0.28 0.62 0.66
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.55 0.64 0.64 0.28 0.63 0.64
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.56 0.63 0.64 0.28 0.64 0.63
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.57 0.63 0.64 0.28 0.66 0.61
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.58 0.62 0.64 0.28 0.68 0.6
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.59 0.62 0.64 0.28 0.69 0.59
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.6 0.62 0.64 0.28 0.69 0.59
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.61 0.61 0.63 0.26 0.7 0.57
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.62 0.61 0.63 0.26 0.7 0.56
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.63 0.6 0.64 0.28 0.72 0.55
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.64 0.59 0.63 0.26 0.74 0.52
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.65 0.59 0.63 0.26 0.74 0.52
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.66 0.58 0.63 0.26 0.75 0.51
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.67 0.57 0.62 0.24 0.75 0.49
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.68 0.56 0.62 0.24 0.77 0.47
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.69 0.56 0.62 0.24 0.79 0.45
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.7 0.55 0.62 0.24 0.8 0.44
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.71 0.55 0.62 0.24 0.81 0.42
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.72 0.54 0.62 0.24 0.81 0.42
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.73 0.54 0.61 0.22 0.82 0.4
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.74 0.5 0.59 0.18 0.86 0.33
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.75 0.49 0.59 0.18 0.86 0.32
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.76 0.49 0.59 0.18 0.86 0.32
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.77 0.48 0.59 0.18 0.87 0.31
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.78 0.48 0.58 0.16 0.87 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.79 0.47 0.58 0.16 0.87 0.29
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## corte acc auc gini TNR TPR
## 1 0.8 0.46 0.58 0.16 0.9 0.26
# Store the AdaBoost metrics on `test` at cutoff 0.56 (presumably picked from
# the cutoff scan above), with the model label in a leading "Modelo" column.
resultado3 <- cbind(Modelo = "AdaBoost", rendimiento(modeloAda, test, 0.56))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
resultado3
## Modelo corte acc auc gini TNR TPR
## 1 AdaBoost 0.56 0.63 0.64 0.28 0.64 0.63
El modelo ganador es un Random Forest usando la data balanceada, ntree = 100 y mtry = 7. Este modelo nos da un AUC de 0.71 y un GINI de 0.42. Dado que el AUC es >= 0.70, el modelo se considera aceptable y viable para su utilización, pero se recomienda buscar un AUC >= 0.80 a través de:
# Finally: stack the result rows of the three models into one comparison table.
rbind(resultado1, resultado2, resultado3)
## Modelo corte acc auc gini TNR TPR
## 1 RegLog 0.50 0.65 0.63 0.26 0.58 0.68
## 2 RandomForest 0.50 0.71 0.71 0.42 0.70 0.71
## 3 AdaBoost 0.56 0.63 0.64 0.28 0.64 0.63
# Load the scoring dataset. This overwrites `original`, which previously held
# the training data read from Base_entrenamiento.csv.
original <- read.csv("Base_prueba.csv", header = TRUE)
dim(original) # loaded 1,000 rows and 123 columns (see the output below)
## [1] 1000 123
data <- original # feature engineering will be done on this dataset
# Drop outliers: keep only the rows that are at or below the upper cap of
# every variable listed here (rows with a missing value in any of them are
# dropped as well). A and B hold the row counts before and after filtering.
A <- nrow(data)
limites_superiores <- c(
  # pc_transaccional was capped twice (136903394, then 92269310.0); only the
  # tighter cap is kept because it implies the looser one.
  pc_transaccional = 92269310.0,
  dmi_max_egreso_diario = 85362328,
  dmi_max_ingreso_diario = 104909227.3,
  dh_val_otros_d = 9998335.80,
  dmi_ingreso_total_mes = 205163705.0,
  dh_val_pagos_d = 50357360.00,
  pc_gasto_familiar = 26582180.19,
  pc_cuotas_pagadas = 8156850.0,
  dmi_egreso_total_mes = 209829245.0,
  dh_val_salidas = 217011100,
  cpc_sum_saldo = 220905770.00,
  cpc_saldo_sobre_ing = 20.307294513,
  cpc_saldo_tdc = 43186883.96,
  dh_cant_tras_d = 31,
  cpc_avg_saldo = 106884393.50,
  dh_val_pago_tarj_d = 10560315.0
)
for (variable in names(limites_superiores)) {
  dentro_del_limite <- data[[variable]] <= limites_superiores[[variable]]
  # which() discards NA comparisons, matching what subset() does.
  data <- data[which(dentro_del_limite), , drop = FALSE]
}
B <- nrow(data)
paste(A - B, "registros omitidos")
## [1] "98 registros omitidos"
# Variable exploration and transformation.
# For each column: show its name, inspect its distribution, and store the
# version that is kept (raw or sqrt-transformed) in a vector of the same name.
names(data)[1] # max_trim: maximum days past due in the previous quarter
## [1] "max_trim"
hist(data$max_trim)
hist(sqrt(data$max_trim)) # variable normalized with sqrt
max_trim <- sqrt(data$max_trim) # sqrt-transformed version is kept
names(data)[2] # max_sem: maximum days past due in the previous semester
## [1] "max_sem"
hist(data$max_sem)
hist(sqrt(data$max_sem))
max_sem <- data$max_sem # kept on the original scale
names(data)[3] # desv_sem: standard deviation of the maximum delinquency in the previous semester
## [1] "desv_sem"
hist(data$desv_sem)
hist(sqrt(data$desv_sem))
desv_sem <- data$desv_sem # kept on the original scale
names(data)[4] # prom_bim: average of the maximum delinquency in the previous two-month period
## [1] "prom_bim"
hist(data$prom_bim)
hist(sqrt(data$prom_bim))
prom_bim <- sqrt(data$prom_bim) # sqrt-transformed version is kept
names(data)[5] # max_mes_anterior: maximum days past due in the previous month
## [1] "max_mes_anterior"
hist(data$max_mes_anterior)
hist(sqrt(data$max_mes_anterior))
max_mes_anterior <- data$max_mes_anterior # kept on the original scale
names(data)[6] # prom_mes_anterior: average days past due in the previous month
## [1] "prom_mes_anterior"
hist(data$prom_mes_anterior)
hist(sqrt(data$prom_mes_anterior))
prom_mes_anterior <- sqrt(data$prom_mes_anterior) # sqrt-transformed version is kept
names(data)[7] # prom_sem: average of the maximum delinquency in the previous semester
## [1] "prom_sem"
hist(data$prom_sem)
hist(sqrt(data$prom_sem))
prom_sem <- data$prom_sem # kept on the original scale
names(data)[8] # max_bim: maximum days past due in the previous two-month period
## [1] "max_bim"
hist(data$max_bim)
hist(sqrt(data$max_bim))
max_bim <- data$max_bim # kept on the original scale
names(data)[9] # mejor_gestion: best management action performed
## [1] "mejor_gestion"
table(data$mejor_gestion)
##
## 0 15
## 901 1
names(data)[10] # prom_trim: average of the maximum delinquency in the previous quarter
## [1] "prom_trim"
hist(data$prom_trim)
hist(sqrt(data$prom_trim))
prom_trim <- sqrt(data$prom_trim) # sqrt-transformed version is kept
names(data)[11] # pc_cant_moras_30_ult_12_meses: number of 30-day delinquencies in the last 12 months <= 5 or empty.
## [1] "pc_cant_moras_30_ult_12_meses"
summary(data$pc_cant_moras_30_ult_12_meses)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.8758 1.0000 12.0000
table(data$pc_cant_moras_30_ult_12_meses)
##
## 0 1 2 3 4 6 7 8 9 10 11 12
## 639 124 24 31 32 18 12 3 12 5 1 1
pc_cant_moras_30_ult_12_meses <- data$pc_cant_moras_30_ult_12_meses # kept on the original scale
names(data)[12] # desv_trim: standard deviation of the maximum delinquency in the previous quarter
## [1] "desv_trim"
hist(data$desv_trim)
hist(sqrt(data$desv_trim))
desv_trim <- sqrt(data$desv_trim) # sqrt-transformed version is kept
names(data)[13] # nro_gestiones: number of management actions performed
## [1] "nro_gestiones"
table(data$nro_gestiones)
##
## 0 2
## 901 1
names(data)[14] # desv_bim: standard deviation of the maximum days past due in the previous two-month period
## [1] "desv_bim"
hist(data$desv_bim)
hist(sqrt(data$desv_bim))
desv_bim <- sqrt(data$desv_bim) # sqrt-transformed version is kept
names(data)[15] # pc_cant_moras_30_ult_3_meses: number of 30-day delinquencies in the last 3 months = 0 or empty.
## [1] "pc_cant_moras_30_ult_3_meses"
summary(data$pc_cant_moras_30_ult_3_meses)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3869 0.0000 3.0000
hist(data$pc_cant_moras_30_ult_3_meses)
names(data)[16] # dh_cant_entradas: number of money-inflow transactions in the previous month
## [1] "dh_cant_entradas"
summary(data$dh_cant_entradas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 6.00 11.59 15.00 129.00
hist(data$dh_cant_entradas)
hist(sqrt(data$dh_cant_entradas))
dh_cant_entradas <- sqrt(data$dh_cant_entradas) # sqrt-transformed version is kept
names(data)[17] # pc_tiem_1er_prod_abierto_total: total time the product has been open
## [1] "pc_tiem_1er_prod_abierto_total"
table(data$pc_tiem_1er_prod_abierto_total)
##
## 0 34 63 65 66 122
## 897 1 1 1 1 1
hist(data$pc_tiem_1er_prod_abierto_total)
names(data)[18] # pc_cant_moras_60_ult_12_meses: number of 60-day delinquencies in the last 12 months <= 1 or empty.
## [1] "pc_cant_moras_60_ult_12_meses"
table(data$pc_cant_moras_60_ult_12_meses)
##
## 0 1 2 3 4 5 6 7 8 9 12
## 778 27 35 23 5 5 6 11 6 5 1
hist(data$pc_cant_moras_60_ult_12_meses) # it will be turned into a binary variable
# Flag is 1 when there was at least one 60-day delinquency, 0 otherwise.
x <- ifelse(data$pc_cant_moras_60_ult_12_meses >= 1, 1, 0)
table(x)
## x
## 0 1
## 778 124
flag_mora60_ult12meses <- x
names(data)[19] # gestiones_eficaces: number of effective management actions
## [1] "gestiones_eficaces"
table(data$gestiones_eficaces)
##
## 0 1
## 901 1
names(data)[20] # pc_transaccional: customer income according to the transactional estimator
## [1] "pc_transaccional"
summary(data$pc_transaccional)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3520425 8811787 15070019 20470286 87552251
boxplot(data$pc_transaccional) # outliers on the upper side of the variable
# Percentiles from 0% to 100% in 1% steps, used to quantify the upper tail.
quantile(data$pc_transaccional, seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7%
## 0 0 0 0 0 0 0 0
## 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 12000 125570 434500
## 16% 17% 18% 19% 20% 21% 22% 23%
## 931700 1232876 1583097 2327651 2390242 2598273 2905406 3127205
## 24% 25% 26% 27% 28% 29% 30% 31%
## 3327102 3520425 3654249 3880363 3900000 4154062 4341266 4785855
## 32% 33% 34% 35% 36% 37% 38% 39%
## 4785855 5056821 5270000 5446336 5684445 5822250 6001779 6187601
## 40% 41% 42% 43% 44% 45% 46% 47%
## 6392479 6392479 6487250 6531800 6707020 6757348 6964516 7264240
## 48% 49% 50% 51% 52% 53% 54% 55%
## 7654414 8280285 8811787 9280070 9411475 9826828 10069905 10765247
## 56% 57% 58% 59% 60% 61% 62% 63%
## 11115000 11212284 11863116 12310765 12733000 12796336 13160097 13700000
## 64% 65% 66% 67% 68% 69% 70% 71%
## 14952138 15247194 15386678 15532112 16000000 16566361 16929221 18004000
## 72% 73% 74% 75% 76% 77% 78% 79%
## 18543000 18662584 20000000 20470286 20470286 21185026 21500000 22029301
## 80% 81% 82% 83% 84% 85% 86% 87%
## 24256619 24256619 24613534 25494191 27920966 28156019 29246260 30636482
## 88% 89% 90% 91% 92% 93% 94% 95%
## 33093800 37612700 37940660 42301382 42852690 45546580 54933792 55731375
## 96% 97% 98% 99% 100%
## 59325287 64094730 78140440 78196633 87552251
# Histogram restricted to values at or below 92269310. The same cap was
# already applied to `data` in the outlier filter earlier in this file, so no
# further values should be excluded here.
x <- subset(data$pc_transaccional, data$pc_transaccional <= 92269310.0)
hist(x)
pc_transaccional <- data$pc_transaccional # kept on the original scale
names(data)[21] # dh_max_dia_entradas: last day on which a money-inflow transaction was received
## [1] "dh_max_dia_entradas"
table(data$dh_max_dia_entradas)
##
## 0 1 2 3 4 5 6 7 9 10 11 12 13 14 15 16 18 20 21 22
## 291 2 3 1 2 1 2 1 1 1 1 1 4 6 2 1 5 2 2 5
## 23 24 25 26 27 28 29 30 31
## 11 8 19 8 23 78 24 145 252
hist(data$dh_max_dia_entradas) # it will be turned into a binary variable
# Flag is 1 when the day is <= 15, 0 otherwise.
# NOTE(review): day 0 (291 rows) is also flagged as 1; presumably 0 means
# "no inflow recorded" -- confirm this is intended.
x <- ifelse(data$dh_max_dia_entradas <= 15, 1, 0)
table(x)
## x
## 0 1
## 583 319
flag_ultima_entrada_1quincena <- x
names(data)[22] # pc_cupo_entidad: credit-card limit at the bank
## [1] "pc_cupo_entidad"
summary(data$pc_cupo_entidad)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 21508 0 17000000
hist(data$pc_cupo_entidad)
# Count of rows with a value of at least 1 versus the rest.
table(ifelse(data$pc_cupo_entidad >= 1, 1, 0))
##
## 0 1
## 899 3
names(data)[23] # pc_cuotas_como_ppal: installments paid as principal
## [1] "pc_cuotas_como_ppal"
summary(data$pc_cuotas_como_ppal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 18017 0 6103800
hist(data$pc_cuotas_como_ppal)
names(data)[24] # dh_val_entradas: total value of the inflows in the previous month
## [1] "dh_val_entradas"
summary(data$dh_val_entradas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 5685000 17657538 25060713 181987436
hist(data$dh_val_entradas)
hist(sqrt(data$dh_val_entradas))
dh_val_entradas <- sqrt(data$dh_val_entradas) # sqrt-transformed version is kept
names(data)[25] # pc_cant_moras_90_ult_12_meses: number of delinquencies of 90 days or more in the last 12 months = 0 or empty.
## [1] "pc_cant_moras_90_ult_12_meses"
table(data$pc_cant_moras_90_ult_12_meses)
##
## 0 1 2 3 4 5 6 7 12
## 814 27 22 7 4 13 3 11 1
hist(data$pc_cant_moras_90_ult_12_meses)
hist(sqrt(data$pc_cant_moras_90_ult_12_meses))
names(data)[26] # dh_max_dia_salidas
## [1] "dh_max_dia_salidas"
# I prefer to keep the same variable but for the current month
names(data)[27] # pc_cant_moras_60_ult_3_meses: number of 60-day delinquencies in the last 3 months = 0 or empty.
## [1] "pc_cant_moras_60_ult_3_meses"
table(data$pc_cant_moras_60_ult_3_meses)
##
## 0 1 2 3
## 800 33 24 45
table(ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0))
##
## 0 1
## 800 102
# Flag is 1 when there was at least one 60-day delinquency, 0 otherwise.
flag_tuvo_mora60_ult3meses <- ifelse(data$pc_cant_moras_60_ult_3_meses >= 1, 1, 0)
names(data)[28] # pc_cuota_tarjeta_de_credito: credit-card installment reported by CIFIN
## [1] "pc_cuota_tarjeta_de_credito"
summary(data$pc_cuota_tarjeta_de_credito)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 9882 0 4726800
hist(data$pc_cuota_tarjeta_de_credito)
table(ifelse(data$pc_cuota_tarjeta_de_credito >= 1, 1, 0))
##
## 0 1
## 897 5
names(data)[29] # cp_inicial_menos_saldo: initial value minus the previous month's balance, per product
## [1] "cp_inicial_menos_saldo"
summary(data$cp_inicial_menos_saldo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
names(data)[30] # pc_peor_estado_act_cta_aho: worst savings-account status
## [1] "pc_peor_estado_act_cta_aho"
summary(data$pc_peor_estado_act_cta_aho)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0133 0.0000 3.0000
hist(data$pc_peor_estado_act_cta_aho)
names(data)[31] # dia_pago: day of the month on which the obligation is paid
## [1] "dia_pago"
table(data$dia_pago)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 38 12 198 11 13 8 12 9 14 13 19 9 15 11 246 17 103 26 9 10
## 21 22 23 24 25 26 27 28 29 30 31
## 5 10 10 14 8 14 9 4 17 17 1
# Flag is 1 when the payment day is <= 15, 0 otherwise.
x <- ifelse(data$dia_pago <= 15, 1, 0)
table(x)
## x
## 0 1
## 274 628
flag_diaPago_1quincena <- x
names(data)[32] # cp_cuotas_falta: Cantidad de cuotas faltantes
## [1] "cp_cuotas_falta"
table(data$cp_cuotas_falta)
##
## 0
## 902
names(data)[33] # pcons_tarjeta_de_credito: Es el producto una tarjeta de credito
## [1] "pcons_tarjeta_de_credito"
table(data$pcons_tarjeta_de_credito)
##
## 0 1
## 607 295
pcons_tarjeta_de_credito <- data$pcons_tarjeta_de_credito
# [34] pc_cifin: customer income according to CIFIN.
colnames(data)[34]
## [1] "pc_cifin"
summary(data[["pc_cifin"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 21787 0 7505630
hist(data$pc_cifin)

# [35] gestiones_prod: number of productive collection contacts.
colnames(data)[35]
## [1] "gestiones_prod"
table(data[["gestiones_prod"]])
##
## 0 1
## 901 1

# [36] pcons_vehiculos_sufi: whether the product is a "vehiculo sufi".
colnames(data)[36]
## [1] "pcons_vehiculos_sufi"
table(data[["pcons_vehiculos_sufi"]])
##
## 0 1
## 836 66

# [37] cluster_recod: cluster segment.
colnames(data)[37]
## [1] "cluster_recod"
table(data[["cluster_recod"]])
##
## 1 6 7 13 16 19
## 8 458 246 82 70 38
# Binary feature: 1 when the segment code equals 6, 0 otherwise.
flag_es_cluster_6 <- as.numeric(data[["cluster_recod"]] == 6)
x <- flag_es_cluster_6
table(x)
## x
## 0 1
## 444 458

# [38] dh_avg_dia_retiros_d: average day of the month on which withdrawals
# are made. Kept as a feature without transformation.
colnames(data)[38]
## [1] "dh_avg_dia_retiros_d"
hist(data$dh_avg_dia_retiros_d)
dh_avg_dia_retiros_d <- data[["dh_avg_dia_retiros_d"]]

# [39] dmi_max_egreso_diario: maximum single-day outflow in the previous month.
colnames(data)[39]
## [1] "dmi_max_egreso_diario"
summary(data[["dmi_max_egreso_diario"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 2509288 5687549 8127384 82421516
hist(data$dmi_max_egreso_diario)
# Percentiles in 1% steps, used to quantify the outliers seen in the histogram.
quantile(data[["dmi_max_egreso_diario"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 11399.35
## 36% 37% 38% 39% 40% 41%
## 57635.56 76406.00 282165.00 546344.40 848246.40 1120309.30
## 42% 43% 44% 45% 46% 47%
## 1278535.00 1506000.00 1586002.00 1800000.00 1808016.60 2005671.00
## 48% 49% 50% 51% 52% 53%
## 2071476.36 2412097.00 2509287.50 2788249.00 2999088.00 3012780.00
## 54% 55% 56% 57% 58% 59%
## 3329386.00 3514000.00 3614412.00 3790122.86 3917983.00 3993058.06
## 60% 61% 62% 63% 64% 65%
## 4314267.60 4329499.00 4760574.00 5179535.80 5434157.40 5609550.35
## 66% 67% 68% 69% 70% 71%
## 6002005.00 6191910.48 6386272.00 6571371.65 6966758.20 7028000.00
## 72% 73% 74% 75% 76% 77%
## 7321076.92 7619654.00 7970963.00 8127383.75 8774232.00 8875431.00
## 78% 79% 80% 81% 82% 83%
## 9546222.00 9937420.51 10054365.60 10794511.00 11621264.60 11949578.85
## 84% 85% 86% 87% 88% 89%
## 12373146.00 13004433.00 13213167.00 13285191.00 14333466.00 15038382.25
## 90% 91% 92% 93% 94% 95%
## 15383235.00 15568797.00 16518175.44 18115766.45 20069227.10 20599199.10
## 96% 97% 98% 99% 100%
## 22645335.40 24225138.55 31233119.30 35903859.00 82421516.00
# Histogram of dmi_max_egreso_diario restricted to values <= 85362328. The cap
# is above the maximum in the recorded summary (82421516), and the feature
# stored below is the uncapped column.
x <- data$dmi_max_egreso_diario[which(data$dmi_max_egreso_diario <= 85362328)]
hist(x)
dmi_max_egreso_diario <- data[["dmi_max_egreso_diario"]]

# [40] cpc_max_proc_deuda: maximum debt percentage in the previous month.
colnames(data)[40]
## [1] "cpc_max_proc_deuda"
summary(data[["cpc_max_proc_deuda"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -28.182000 0.000000 0.003515 0.158809 0.847877 1.272912

# [41] dh_cant_otros_d: number of outgoing transactions classified as "otros".
colnames(data)[41]
## [1] "dh_cant_otros_d"
summary(data[["dh_cant_otros_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 3.204 5.000 28.000
hist(data$dh_cant_otros_d)
hist(sqrt(data$dh_cant_otros_d))
# The feature is stored on the square-root scale.
dh_cant_otros_d <- sqrt(data[["dh_cant_otros_d"]])

# [42] pc_cont_30_lt_12m_tot_sf: count of 30-day delinquencies in the last
# 12 months, financial sector.
colnames(data)[42]
## [1] "pc_cont_30_lt_12m_tot_sf"
summary(data[["pc_cont_30_lt_12m_tot_sf"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
hist(data$pc_cont_30_lt_12m_tot_sf)
table(data[["pc_cont_30_lt_12m_tot_sf"]])
##
## 0
## 902

# [43] pc_cant_mora90_ult_12m_total
colnames(data)[43]
## [1] "pc_cant_mora90_ult_12m_total"
table(data[["pc_cant_mora90_ult_12m_total"]])
##
## 0 1 2 3 4 5 6 7 12
## 856 16 2 1 4 8 3 11 1
table(as.numeric(data[["pc_cant_mora90_ult_12m_total"]] >= 1))
##
## 0 1
## 856 46
# Binary feature: 1 when the count is at least 1, 0 otherwise.
flag_tuvo_mora90_ult12M <- as.numeric(data[["pc_cant_mora90_ult_12m_total"]] >= 1)
x <- flag_tuvo_mora90_ult12M

# [44] dmi_max_ingreso_diario: maximum single-day inflow in the previous month.
colnames(data)[44]
## [1] "dmi_max_ingreso_diario"
summary(data[["dmi_max_ingreso_diario"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 2534003 6832447 10000000 99410001
hist(data$dmi_max_ingreso_diario)
quantile(data[["dmi_max_ingreso_diario"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 2.35
## 36% 37% 38% 39% 40% 41%
## 309.00 615.78 50000.00 354796.22 835472.40 1000000.00
## 42% 43% 44% 45% 46% 47%
## 1200000.00 1485150.36 1648761.12 1944331.70 2045478.26 2123515.51
## 48% 49% 50% 51% 52% 53%
## 2337440.00 2402294.67 2534003.00 2984507.57 3000006.56 3215770.00
## 54% 55% 56% 57% 58% 59%
## 3460925.00 3622502.05 3843849.28 3900211.00 4207500.00 4407615.09
## 60% 61% 62% 63% 64% 65%
## 4651730.20 5067101.17 5500000.00 5728921.55 6019212.60 6209045.40
## 66% 67% 68% 69% 70% 71%
## 6399660.00 6875000.00 7000000.00 7200000.00 7348904.00 8000000.00
## 72% 73% 74% 75% 76% 77%
## 8378862.00 9052866.00 9986436.52 10000000.00 10000000.00 10000000.00
## 78% 79% 80% 81% 82% 83%
## 10257019.00 10638016.52 11216000.00 11974437.86 13164000.00 13445811.23
## 84% 85% 86% 87% 88% 89%
## 13700001.00 14357715.70 14950272.84 15830394.00 17779183.08 18490095.00
## 90% 91% 92% 93% 94% 95%
## 19175443.00 19495802.00 20801484.44 23791026.00 25000000.00 26902125.00
## 96% 97% 98% 99% 100%
## 33287967.36 35678300.00 39273879.36 41724001.00 99410001.00
# Histogram of dmi_max_ingreso_diario restricted to values <= 104909227.3; the
# feature stored below is the uncapped column.
x <- data$dmi_max_ingreso_diario[which(data$dmi_max_ingreso_diario <= 104909227.3)]
hist(x)
dmi_max_ingreso_diario <- data[["dmi_max_ingreso_diario"]]

# [45] dh_val_otros_d: total value of outgoing transactions classified as
# "otros" in a month.
colnames(data)[45]
## [1] "dh_val_otros_d"
summary(data[["dh_val_otros_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 12586 802098 697294 9777278
hist(data$dh_val_otros_d)
quantile(data[["dh_val_otros_d"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 7% 8% 9% 10% 11% 12% 13%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 14% 15% 16% 17% 18% 19% 20%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 21% 22% 23% 24% 25% 26% 27%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 28% 29% 30% 31% 32% 33% 34%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 35% 36% 37% 38% 39% 40% 41%
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## 42% 43% 44% 45% 46% 47% 48%
## 0.00 0.00 0.00 0.00 0.00 1610.00 2756.40
## 49% 50% 51% 52% 53% 54% 55%
## 10100.00 12586.00 13271.00 19601.04 20200.00 28692.00 38891.20
## 56% 57% 58% 59% 60% 61% 62%
## 49211.00 56686.15 60420.00 68691.68 76824.00 119253.19 133905.00
## 63% 64% 65% 66% 67% 68% 69%
## 174475.03 201200.00 233389.00 271140.00 309311.93 332894.00 349316.00
## 70% 71% 72% 73% 74% 75% 76%
## 407576.50 452845.97 483246.00 513799.14 596183.08 697293.75 713901.68
## 77% 78% 79% 80% 81% 82% 83%
## 827700.00 928996.18 1033514.61 1102974.80 1127520.00 1271960.24 1384444.00
## 84% 85% 86% 87% 88% 89% 90%
## 1648990.52 1716832.70 1872117.00 2066960.98 2388746.72 2564671.00 2652146.00
## 91% 92% 93% 94% 95% 96% 97%
## 2956094.31 3320259.56 3663815.21 4115774.00 5073191.50 5825378.92 6362258.37
## 98% 99% 100%
## 7211642.08 7645053.36 9777278.00
# Histogram of dh_val_otros_d restricted to values <= 9998335.80; the feature
# stored below is the uncapped column.
x <- data$dh_val_otros_d[which(data$dh_val_otros_d <= 9998335.80)]
hist(x)
dh_val_otros_d <- data[["dh_val_otros_d"]]

# [46] pc_ingreso_final: customer's final income. Kept without transformation.
colnames(data)[46]
## [1] "pc_ingreso_final"
summary(data[["pc_ingreso_final"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3315000 8362472 14053377 18708862 87552251
hist(data$pc_ingreso_final)
pc_ingreso_final <- data[["pc_ingreso_final"]]

# [47] dh_cant_pagos_d: number of outgoing payments in the previous month.
# Kept without transformation.
colnames(data)[47]
## [1] "dh_cant_pagos_d"
summary(data[["dh_cant_pagos_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 3.500 5.346 8.000 35.000
hist(data$dh_cant_pagos_d)
dh_cant_pagos_d <- data[["dh_cant_pagos_d"]]

# [48] dmi_ingreso_total_mes: total inflow of the previous month.
colnames(data)[48]
## [1] "dmi_ingreso_total_mes"
summary(data[["dmi_ingreso_total_mes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 5685000 17657538 25060713 181987436
hist(data$dmi_ingreso_total_mes)
quantile(data[["dmi_ingreso_total_mes"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 3.70
## 36% 37% 38% 39% 40% 41%
## 317.28 1592.74 53032.00 520933.81 1001004.40 1724907.50
## 42% 43% 44% 45% 46% 47%
## 2240001.00 2520715.00 3185149.60 3571411.35 4411157.02 4817541.00
## 48% 49% 50% 51% 52% 53%
## 5160000.00 5461628.03 5685000.00 6531800.00 7248359.04 7793376.50
## 54% 55% 56% 57% 58% 59%
## 8270000.00 8530406.00 9188379.16 9658108.00 10416391.60 11189637.61
## 60% 61% 62% 63% 64% 65%
## 11948978.60 12418588.01 13000000.00 13673026.68 14513000.00 15067301.05
## 66% 67% 68% 69% 70% 71%
## 16128330.00 16748600.00 18163873.72 18547426.92 20132597.40 21500000.00
## 72% 73% 74% 75% 76% 77%
## 22000000.00 22528017.00 23377940.20 25060713.00 27384078.44 28400354.79
## 78% 79% 80% 81% 82% 83%
## 30557407.96 31241943.45 31844647.40 31944284.00 34174945.00 36977650.33
## 84% 85% 86% 87% 88% 89%
## 39152130.00 43148753.00 46185407.00 48841121.03 52888577.60 53816089.00
## 90% 91% 92% 93% 94% 95%
## 56323174.00 60959669.06 66736925.00 67675419.00 68100000.00 71880165.00
## 96% 97% 98% 99% 100%
## 73308273.00 79961357.24 88862497.06 109768839.74 181987436.00
# Histogram of dmi_ingreso_total_mes restricted to values <= 205163705.0; the
# feature stored below is the uncapped column.
x <- data$dmi_ingreso_total_mes[which(data$dmi_ingreso_total_mes <= 205163705.0)]
hist(x)
dmi_ingreso_total_mes <- data[["dmi_ingreso_total_mes"]]

# [49] dh_val_pagos_d: total value of outgoing transactions classified as
# payments ("pagos otros").
colnames(data)[49]
## [1] "dh_val_pagos_d"
summary(data[["dh_val_pagos_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 537640 3639828 4661394 46719192
hist(data$dh_val_pagos_d)
quantile(data[["dh_val_pagos_d"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 36% 37% 38% 39% 40% 41%
## 0.00 0.00 75.34 42343.00 56773.00 69144.00
## 42% 43% 44% 45% 46% 47%
## 91390.00 151658.00 159690.00 165118.55 178014.00 179314.00
## 48% 49% 50% 51% 52% 53%
## 306936.36 318125.77 537640.00 678424.39 806356.20 1059441.20
## 54% 55% 56% 57% 58% 59%
## 1212291.94 1330827.25 1560711.00 1662900.00 1823001.70 1999493.24
## 60% 61% 62% 63% 64% 65%
## 2085516.80 2220129.00 2315083.28 2490672.54 2778419.00 3040516.20
## 66% 67% 68% 69% 70% 71%
## 3277671.00 3344574.00 3409709.20 3536084.43 3600516.00 3766572.79
## 72% 73% 74% 75% 76% 77%
## 3900425.24 4283458.20 4490363.42 4661393.75 4973548.56 5176261.00
## 78% 79% 80% 81% 82% 83%
## 5271188.86 5399363.49 5707806.00 6166031.00 6853466.00 7095437.00
## 84% 85% 86% 87% 88% 89%
## 7265797.40 7453799.00 7733374.12 8210036.00 9130377.00 9750499.50
## 90% 91% 92% 93% 94% 95%
## 10373706.00 10980804.97 12745205.00 13360645.10 13522747.00 14262631.00
## 96% 97% 98% 99% 100%
## 15040059.56 21305241.00 27212693.58 36549357.01 46719192.00
# Histogram of dh_val_pagos_d restricted to values <= 50357360.00; the feature
# stored below is the uncapped column.
x <- data$dh_val_pagos_d[which(data$dh_val_pagos_d <= 50357360.00)]
hist(x)
dh_val_pagos_d <- data[["dh_val_pagos_d"]]

# [50] pc_gasto_familiar: value of the customer's family expenses.
colnames(data)[50]
## [1] "pc_gasto_familiar"
summary(data[["pc_gasto_familiar"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 998131 2284566 3596369 4568750 21888063
quantile(data[["pc_gasto_familiar"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 63033.82 350000.00 491062.78 595099.36
## 18% 19% 20% 21% 22% 23%
## 665194.70 765000.00 829934.46 844698.40 864358.37 905320.38
## 24% 25% 26% 27% 28% 29%
## 943500.00 998130.70 1053864.93 1086803.67 1107646.36 1170000.00
## 30% 31% 32% 33% 34% 35%
## 1235790.67 1307972.22 1401226.17 1435756.50 1449533.38 1564995.31
## 36% 37% 38% 39% 40% 41%
## 1584240.00 1629589.42 1630082.22 1637151.00 1654248.75 1680000.00
## 42% 43% 44% 45% 46% 47%
## 1710290.10 1732919.27 1773123.19 1800533.81 1852381.20 2042659.37
## 48% 49% 50% 51% 52% 53%
## 2118915.12 2232519.95 2284565.98 2314858.89 2397248.78 2446335.13
## 54% 55% 56% 57% 58% 59%
## 2510217.45 2530766.61 2568167.58 2705762.50 2708752.54 2759275.31
## 60% 61% 62% 63% 64% 65%
## 2778750.00 2853154.49 2990963.55 3240028.70 3262926.88 3304375.00
## 66% 67% 68% 69% 70% 71%
## 3400000.00 3557347.50 3738034.50 3850755.50 3981578.82 4232305.25
## 72% 73% 74% 75% 76% 77%
## 4258569.80 4349935.73 4473926.14 4568750.00 4635750.00 4665646.00
## 78% 79% 80% 81% 82% 83%
## 5079723.23 5154531.56 5214205.69 5296256.50 5296256.50 5660150.00
## 84% 85% 86% 87% 88% 89%
## 5976224.40 6430000.00 7311417.07 7509873.84 8004993.21 8217756.65
## 90% 91% 92% 93% 94% 95%
## 9000354.71 9106196.62 9403175.00 10619436.29 12488403.37 13932843.75
## 96% 97% 98% 99% 100%
## 14228463.97 15245233.01 19531282.01 19549158.17 21888062.75
# Histogram of pc_gasto_familiar restricted to values <= 26582180.19; the
# feature stored below is the uncapped column.
x <- data$pc_gasto_familiar[which(data$pc_gasto_familiar <= 26582180.19)]
hist(x)
pc_gasto_familiar <- data[["pc_gasto_familiar"]]

# [51] pc_cuotas_pagadas: value of the installments paid by the customer.
colnames(data)[51]
## [1] "pc_cuotas_pagadas"
summary(data[["pc_cuotas_pagadas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 343148 913284 1373781 7184545
hist(data$pc_cuotas_pagadas)
quantile(data[["pc_cuotas_pagadas"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 8% 9% 10% 11% 12% 13% 14% 15%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 16% 17% 18% 19% 20% 21% 22% 23%
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## 24% 25% 26% 27% 28% 29% 30% 31%
## 0.0 0.0 0.0 0.0 130.0 62150.0 87000.0 116000.0
## 32% 33% 34% 35% 36% 37% 38% 39%
## 143028.0 145000.0 179265.8 213500.0 220000.0 242425.2 258000.0 258000.0
## 40% 41% 42% 43% 44% 45% 46% 47%
## 258000.0 258000.0 258000.0 275855.6 280000.0 286810.0 294600.0 300000.0
## 48% 49% 50% 51% 52% 53% 54% 55%
## 332317.2 333200.0 343148.2 364200.7 384200.0 384698.3 400000.0 429782.6
## 56% 57% 58% 59% 60% 61% 62% 63%
## 473000.0 490000.0 504164.1 516000.0 543636.1 543636.1 646224.7 648815.1
## 64% 65% 66% 67% 68% 69% 70% 71%
## 676982.9 806622.1 847182.0 847182.0 855700.0 860000.0 874639.5 928418.7
## 72% 73% 74% 75% 76% 77% 78% 79%
## 1065817.2 1120496.9 1354695.3 1373781.0 1484196.6 1503863.1 1730412.1 1737940.1
## 80% 81% 82% 83% 84% 85% 86% 87%
## 1769838.0 1808000.0 1840035.3 1855943.0 1885749.1 1891250.5 2117686.3 2132206.4
## 88% 89% 90% 91% 92% 93% 94% 95%
## 2283470.0 2451854.4 2492565.3 2493312.4 2865653.8 2983499.5 3017035.4 3234839.9
## 96% 97% 98% 99% 100%
## 3303345.8 4616921.3 5167146.6 7155587.2 7184544.5
# Histogram of pc_cuotas_pagadas restricted to values <= 8156850.0; the feature
# stored below is the uncapped column.
x <- data$pc_cuotas_pagadas[which(data$pc_cuotas_pagadas <= 8156850.0)]
hist(x)
pc_cuotas_pagadas <- data[["pc_cuotas_pagadas"]]

# [52] cpc_avg_proc_deuda: average debt percentage across a customer's debts
# in the previous month (percentage = ratio of balance to initial value).
colnames(data)[52]
## [1] "cpc_avg_proc_deuda"
summary(data[["cpc_avg_proc_deuda"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -28.182000 0.000000 0.003075 0.136219 0.799321 1.224909
hist(data$cpc_avg_proc_deuda)
quantile(data[["cpc_avg_proc_deuda"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4%
## -28.182000000 -0.021213000 -0.003272055 0.000000000 0.000000000
## 5% 6% 7% 8% 9%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 10% 11% 12% 13% 14%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 15% 16% 17% 18% 19%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 20% 21% 22% 23% 24%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 25% 26% 27% 28% 29%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 30% 31% 32% 33% 34%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 35% 36% 37% 38% 39%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 40% 41% 42% 43% 44%
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 45% 46% 47% 48% 49%
## 0.000000000 0.002474419 0.002631783 0.002631783 0.002631783
## 50% 51% 52% 53% 54%
## 0.003075038 0.018633346 0.059156000 0.096896600 0.115248053
## 55% 56% 57% 58% 59%
## 0.168734000 0.215616247 0.246319000 0.277754880 0.312850843
## 60% 61% 62% 63% 64%
## 0.362930164 0.402114125 0.427537092 0.442012067 0.478433000
## 65% 66% 67% 68% 69%
## 0.545081738 0.582370425 0.594232417 0.640208326 0.684193917
## 70% 71% 72% 73% 74%
## 0.700694736 0.746537701 0.757732590 0.776391909 0.784705667
## 75% 76% 77% 78% 79%
## 0.799320623 0.824395833 0.830392909 0.834281000 0.842780631
## 80% 81% 82% 83% 84%
## 0.853675575 0.858839117 0.868951947 0.895844211 0.929755381
## 85% 86% 87% 88% 89%
## 0.962535958 0.966394034 0.973559686 0.980652725 0.990590504
## 90% 91% 92% 93% 94%
## 1.017928827 1.045238187 1.072557625 1.085973066 1.098219571
## 95% 96% 97% 98% 99%
## 1.099494833 1.111387925 1.119784114 1.149961810 1.193426598
## 100%
## 1.224909400
# [53] cpc_sum_proc_deuda: sum of the debt percentages of a customer in the
# previous month (percentage = ratio of balance to initial value).
# NOTE(review): the recorded summary below matches the one recorded for
# cpc_avg_proc_deuda value for value; verify whether the two columns coincide.
colnames(data)[53]
## [1] "cpc_sum_proc_deuda"
summary(data[["cpc_sum_proc_deuda"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -28.182000 0.000000 0.003075 0.136219 0.799321 1.224909
hist(data$cpc_sum_proc_deuda)

# [54] dc_porc_prod_sin_mora: percentage of products without delinquency
# across the whole system.
colnames(data)[54]
## [1] "dc_porc_prod_sin_mora"
summary(data[["dc_porc_prod_sin_mora"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.005543 0.000000 1.000000
hist(data$dc_porc_prod_sin_mora)
quantile(data[["dc_porc_prod_sin_mora"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 96% 97% 98% 99% 100%
## 0 0 0 0 1
# [55] pc_ingreso_rutina_con_techo: routine income after applying the
# per-segment ceilings. Kept without transformation.
colnames(data)[55]
## [1] "pc_ingreso_rutina_con_techo"
summary(data[["pc_ingreso_rutina_con_techo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3550276 8811787 15065481 20470286 87552251
hist(data$pc_ingreso_rutina_con_techo)
pc_ingreso_rutina_con_techo <- data[["pc_ingreso_rutina_con_techo"]]

# [56] pc_saldo_prom3_tdc_entidad: average credit-card balance at the bank
# over the last 3 months.
colnames(data)[56]
## [1] "pc_saldo_prom3_tdc_entidad"
summary(data[["pc_saldo_prom3_tdc_entidad"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 5652 0 3070000
hist(data$pc_saldo_prom3_tdc_entidad)

# [57] dh_cant_salidas: number of outgoing money transactions in a month.
# Kept without transformation.
colnames(data)[57]
## [1] "dh_cant_salidas"
summary(data[["dh_cant_salidas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 19.00 28.66 49.00 184.00
hist(data$dh_cant_salidas)
dh_cant_salidas <- data[["dh_cant_salidas"]]

# [58] dh_min_dia_pagos_d: first day on which a credit was made in the
# previous month. Kept without transformation.
colnames(data)[58]
## [1] "dh_min_dia_pagos_d"
summary(data[["dh_min_dia_pagos_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 3.171 3.000 31.000
hist(data$dh_min_dia_pagos_d)
dh_min_dia_pagos_d <- data[["dh_min_dia_pagos_d"]]
# [59] pc_ingreso_por_rutina: routine income. Kept without transformation.
colnames(data)[59]
## [1] "pc_ingreso_por_rutina"
summary(data[["pc_ingreso_por_rutina"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3550276 8811787 15083531 20470286 87552251
hist(data$pc_ingreso_por_rutina)
pc_ingreso_por_rutina <- data[["pc_ingreso_por_rutina"]]

# [60] dh_min_dia_pago_tarj_d: first day on which the credit card was paid in
# the previous month.
colnames(data)[60]
## [1] "dh_min_dia_pago_tarj_d"
hist(data$dh_min_dia_pago_tarj_d)

# [61] cp_nro_cuota: number of agreed installments per product.
colnames(data)[61]
## [1] "cp_nro_cuota"
summary(data[["cp_nro_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0

# [62] dmi_egreso_total_mes: total outflow of the previous month.
colnames(data)[62]
## [1] "dmi_egreso_total_mes"
summary(data[["dmi_egreso_total_mes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 6710852 18203268 24720956 191314996
hist(data$dmi_egreso_total_mes)
quantile(data[["dmi_egreso_total_mes"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 11399.35
## 36% 37% 38% 39% 40% 41%
## 65750.00 144062.00 476365.00 857438.55 1373997.80 2032524.46
## 42% 43% 44% 45% 46% 47%
## 2425098.10 3147436.28 3637537.36 4297695.00 4985629.00 5242864.00
## 48% 49% 50% 51% 52% 53%
## 5768836.60 6471482.00 6710852.00 7234899.00 7941029.00 8321595.04
## 54% 55% 56% 57% 58% 59%
## 9066380.48 9451729.00 10147091.00 11474139.00 11927600.00 12472455.70
## 60% 61% 62% 63% 64% 65%
## 13236143.00 13635595.00 14447329.22 15199359.70 16050052.32 16698889.00
## 66% 67% 68% 69% 70% 71%
## 17675826.00 18687460.99 19390695.00 20624901.00 21419482.00 21608609.00
## 72% 73% 74% 75% 76% 77%
## 22283315.00 23681242.17 23844136.20 24720956.00 25905897.48 27633120.00
## 78% 79% 80% 81% 82% 83%
## 29004586.56 30127755.34 31398939.00 31492755.00 32045289.72 35278365.00
## 84% 85% 86% 87% 88% 89%
## 40680145.00 43554385.50 45389761.66 48031045.00 52135165.00 56926454.85
## 90% 91% 92% 93% 94% 95%
## 60640219.00 64217571.00 66239075.00 68137836.00 71343296.66 78399975.45
## 96% 97% 98% 99% 100%
## 80045627.00 80352549.82 89004556.92 111049163.72 191314996.00
# Histogram of dmi_egreso_total_mes restricted to values <= 209829245.0; the
# feature stored below is the uncapped column.
x <- data$dmi_egreso_total_mes[which(data$dmi_egreso_total_mes <= 209829245.0)]
hist(x)
dmi_egreso_total_mes <- data[["dmi_egreso_total_mes"]]

# [63] cp_valor_inicial: initial value of the obligation, per product.
colnames(data)[63]
## [1] "cp_valor_inicial"
summary(data[["cp_valor_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# [64] dh_max_dia_otros_d: last day on which a debit classified as "otros" was
# made in the previous month.
names(data)[64]
## [1] "dh_max_dia_otros_d"
hist(data$dh_max_dia_otros_d)
# Binary feature: 1 when that last day is in the first half of the month
# (day <= 15), 0 otherwise.
# NOTE(review): if this column uses 0 to mean "no such transaction" (TODO
# confirm), those rows are also flagged as first-half-of-month.
x <- ifelse(data$dh_max_dia_otros_d <= 15, 1, 0)
table(x)
## x
## 0 1
## 387 515
# Store the feature under the `flag_` prefix used by the other binary features.
flag_maxDiaOtrosD_1quincena <- x
# Backward-compatible alias: the original script misspelled the prefix as
# `flax_`; keep that name defined so existing references keep working.
flax_maxDiaOtrosD_1quincena <- flag_maxDiaOtrosD_1quincena
# [65] cp_cuota_sobre_saldo: installment value over balance, per product.
colnames(data)[65]
## [1] "cp_cuota_sobre_saldo"
summary(data[["cp_cuota_sobre_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0

# [66] dc_sum_valor_inicial: sum of the initial values of obligations across
# the whole financial system.
colnames(data)[66]
## [1] "dc_sum_valor_inicial"
summary(data[["dc_sum_valor_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 501971 0 310276000
hist(data$dc_sum_valor_inicial)

# [67] dh_cant_pago_tarj_d: number of outgoing transactions classified as
# credit-card payments.
colnames(data)[67]
## [1] "dh_cant_pago_tarj_d"
summary(data[["dh_cant_pago_tarj_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.9678 1.0000 33.0000
hist(data$dh_cant_pago_tarj_d)

# [68] dh_max_dia_pagos_d: last day on which a credit was made in the
# previous month.
colnames(data)[68]
## [1] "dh_max_dia_pagos_d"
hist(data$dh_max_dia_pagos_d)
# Binary feature: 1 when that last day is <= 15, 0 otherwise.
flag_maxDiaPagos_d_1quincena <- as.numeric(data[["dh_max_dia_pagos_d"]] <= 15)
x <- flag_maxDiaPagos_d_1quincena
table(x)
## x
## 0 1
## 505 397

# [69] cp_saldo_sobre_inicial: balance over initial value, per product,
# previous month.
colnames(data)[69]
## [1] "cp_saldo_sobre_inicial"
summary(data[["cp_saldo_sobre_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
# [70] pc_mediana_nom3: median of the last 3 payroll payments, used for the
# payroll-loan refinancing calculation.
colnames(data)[70]
## [1] "pc_mediana_nom3"
summary(data[["pc_mediana_nom3"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 50084 0 4287978
hist(data$pc_mediana_nom3)

# [71] cp_esta_cuota_otro: whether the installment status is "otro".
colnames(data)[71]
## [1] "cp_esta_cuota_otro"
summary(data[["cp_esta_cuota_otro"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0

# [72] dh_max_dia_retiros_d: last day on which a withdrawal was made in the
# previous month.
colnames(data)[72]
## [1] "dh_max_dia_retiros_d"
hist(data$dh_max_dia_retiros_d)
# Author's note: variable considered incorrect; the 3rd quartile is 30 and a
# month has up to 31 days.

# [73] dh_avg_dia_entradas: average day of the month on which money inflows
# are received.
colnames(data)[73]
## [1] "dh_avg_dia_entradas"
summary(data[["dh_avg_dia_entradas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 4.585 4.457 6.557 31.000
hist(data$dh_avg_dia_entradas)

# [74] dh_avg_dia_pagos_d: average day of the month on which outgoing
# payments are made. Only tabulated here (day <= 10 versus later); no feature
# is stored in this section.
colnames(data)[74]
## [1] "dh_avg_dia_pagos_d"
x <- as.numeric(data[["dh_avg_dia_pagos_d"]] <= 10)
table(x)
## x
## 0 1
## 1 901

# [75] dh_val_salidas: total value of outgoing transactions in a month.
colnames(data)[75]
## [1] "dh_val_salidas"
summary(data[["dh_val_salidas"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 6710852 18220966 24720956 191314996
hist(data$dh_val_salidas)
quantile(data[["dh_val_salidas"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 11399.34 38171.30
## 36% 37% 38% 39% 40% 41%
## 97964.00 275655.54 524117.36 1168205.09 1781441.00 2117572.83
## 42% 43% 44% 45% 46% 47%
## 2514844.00 3349365.00 3817628.00 4412149.25 5013252.92 5242864.00
## 48% 49% 50% 51% 52% 53%
## 5768836.60 6471482.00 6710852.00 7234899.00 7941029.00 8321595.04
## 54% 55% 56% 57% 58% 59%
## 9066380.48 9451729.00 10147091.00 11474139.00 11927600.00 12472455.70
## 60% 61% 62% 63% 64% 65%
## 13236143.00 13635595.00 14447329.22 15199359.70 16050052.32 16698889.00
## 66% 67% 68% 69% 70% 71%
## 17675826.00 18687460.99 19390695.00 20624901.00 21419482.00 21608609.00
## 72% 73% 74% 75% 76% 77%
## 22283315.00 23681242.17 23844136.20 24720956.00 25905897.48 27633120.00
## 78% 79% 80% 81% 82% 83%
## 29004586.56 30127755.34 31398939.00 31492755.00 32045289.72 35278365.00
## 84% 85% 86% 87% 88% 89%
## 40680145.00 43554385.50 45389761.66 48031045.00 52135165.00 56926454.85
## 90% 91% 92% 93% 94% 95%
## 60640219.00 64217571.00 66239075.00 68137836.00 71343296.66 78399975.45
## 96% 97% 98% 99% 100%
## 80045627.00 80352549.82 89004556.92 111049163.72 191314996.00
# Histogram of dh_val_salidas restricted to values <= 217011100; the feature
# stored below is the uncapped column.
x <- data$dh_val_salidas[which(data$dh_val_salidas <= 217011100)]
hist(x)
dh_val_salidas <- data[["dh_val_salidas"]]

# [76] dc_sum_valor_cuota: described in the original script as "total value
# of outgoing transactions in a month".
# NOTE(review): that wording is identical to the description given for
# dh_val_salidas and looks copy-pasted; confirm against the data dictionary.
colnames(data)[76]
## [1] "dc_sum_valor_cuota"
summary(data[["dc_sum_valor_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 33407 0 13360000
hist(data$dc_sum_valor_cuota)

# [77] dh_min_dia_tras_d: first day on which a transfer payment was made in
# the previous month. Only tabulated here (day <= 10 versus later).
colnames(data)[77]
## [1] "dh_min_dia_tras_d"
hist(data$dh_min_dia_tras_d)
x <- as.numeric(data[["dh_min_dia_tras_d"]] <= 10)
table(x)
## x
## 0 1
## 99 803

# [78] cp_porc_valorcuot_ing: ratio of installment value to income, per
# product.
colnames(data)[78]
## [1] "cp_porc_valorcuot_ing"
summary(data[["cp_porc_valorcuot_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0

# [79] pc_ind_ajustado: customer's adjusted net disposable income.
colnames(data)[79]
## [1] "pc_ind_ajustado"
summary(data[["pc_ind_ajustado"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4115281 994836 3591609 6805845 9008932 46552980
hist(data$pc_ind_ajustado)
quantile(data[["pc_ind_ajustado"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -4115280.80 -1826179.99 -289405.70 -96562.97 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 48003.75
## 18% 19% 20% 21% 22% 23%
## 109162.44 190428.75 246883.42 538385.69 648167.69 799010.24
## 24% 25% 26% 27% 28% 29%
## 859590.00 994835.77 1157424.69 1289365.08 1350754.47 1404675.00
## 30% 31% 32% 33% 34% 35%
## 1480053.89 1587621.36 1587621.36 1650235.75 1822835.97 1879208.30
## 36% 37% 38% 39% 40% 41%
## 1948169.40 2003917.50 2154537.00 2518881.75 2677500.00 2822090.15
## 42% 43% 44% 45% 46% 47%
## 2911943.92 3091390.95 3190972.67 3190972.67 3241722.38 3241722.38
## 48% 49% 50% 51% 52% 53%
## 3413106.65 3505538.87 3591609.21 3591609.21 3872384.69 4317499.00
## 54% 55% 56% 57% 58% 59%
## 4357215.99 4474653.75 4668131.63 4848009.69 4974054.87 5360090.10
## 60% 61% 62% 63% 64% 65%
## 5741412.68 5741412.68 5777751.94 5859048.42 5895596.49 6025889.71
## 66% 67% 68% 69% 70% 71%
## 6098398.45 6318411.44 6806462.97 7365699.76 7610175.00 7897292.18
## 72% 73% 74% 75% 76% 77%
## 8258805.03 8432540.03 8888342.46 9008932.31 10100269.49 10443540.70
## 78% 79% 80% 81% 82% 83%
## 11475000.00 11744826.48 11744826.48 11854694.27 12430296.87 13127593.73
## 84% 85% 86% 87% 88% 89%
## 13154771.40 14039078.44 14122015.13 15690171.27 15790025.89 18134416.04
## 90% 91% 92% 93% 94% 95%
## 18941384.29 21621182.46 22345186.29 23339886.01 23429767.22 24527785.16
## 96% 97% 98% 99% 100%
## 26203986.86 30567968.50 30946818.46 36547579.52 46552980.18
# Histogram of pc_ind_ajustado restricted to values <= 68107989.8. No feature
# is stored for this variable in this section.
x <- data$pc_ind_ajustado[which(data$pc_ind_ajustado <= 68107989.8)]
hist(x)

# [80] dh_val_retiros_d: total value of outgoing transactions classified as
# withdrawals in a month.
colnames(data)[80]
## [1] "dh_val_retiros_d"
summary(data[["dh_val_retiros_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 1048890 8526310 10895497 164054718
hist(data$dh_val_retiros_d)

# [81] pc_tiem_lt_prod_abie_total: time since the last product was opened.
colnames(data)[81]
## [1] "pc_tiem_lt_prod_abie_total"
summary(data[["pc_tiem_lt_prod_abie_total"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03104 0.00000 9.00000
hist(data$pc_tiem_lt_prod_abie_total)

# [82] marca_info_cifin_decode: CIFIN mark (consulted, not consulted, not
# found, etc.).
colnames(data)[82]
## [1] "marca_info_cifin_decode"
table(data[["marca_info_cifin_decode"]])
##
## 0 1 2
## 645 5 252
# Binary feature: 1 when the mark code equals 0, 0 otherwise.
flag_encontrado_cifin <- as.numeric(data[["marca_info_cifin_decode"]] == 0)
x <- flag_encontrado_cifin
table(x)
## x
## 0 1
## 257 645
# [83] dh_max_dia_pago_tarj_d: last day on which the credit card was paid in
# the previous month.
colnames(data)[83]
## [1] "dh_max_dia_pago_tarj_d"
hist(data$dh_max_dia_pago_tarj_d)

# [84] dc_valobli_ing: sum of the initial values of obligations in the
# financial system, over income.
colnames(data)[84]
## [1] "dc_valobli_ing"
summary(data[["dc_valobli_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03364 0.00000 23.86188
hist(data$dc_valobli_ing)

# [85] pc_cantidad_tdc_entidad: number of credit cards held at the bank.
colnames(data)[85]
## [1] "pc_cantidad_tdc_entidad"
table(data[["pc_cantidad_tdc_entidad"]])
##
## 0 1 2
## 899 2 1

# [86] dh_min_dia_otros_d: first day on which a debit classified as "otros"
# was made in the previous month. Only tabulated here (day <= 15 versus later).
colnames(data)[86]
## [1] "dh_min_dia_otros_d"
hist(data$dh_min_dia_otros_d)
x <- as.numeric(data[["dh_min_dia_otros_d"]] <= 15)
table(x)
## x
## 0 1
## 65 837

# [87] dc_cant_obligaciones: number of obligations. Only tabulated here
# (at least one obligation versus none).
colnames(data)[87]
## [1] "dc_cant_obligaciones"
table(data[["dc_cant_obligaciones"]])
##
## 0 3 8 10 15
## 897 1 2 1 1
summary(data[["dc_cant_obligaciones"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04878 0.00000 15.00000
hist(data$dc_cant_obligaciones)
x <- as.numeric(data[["dc_cant_obligaciones"]] >= 1)
table(x)
## x
## 0 1
## 897 5
colnames(data)[88] # cpc_sum_nro_cuota: sum of the number of instalments across all of the client's obligations
## [1] "cpc_sum_nro_cuota"
summary(data[["cpc_sum_nro_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 25.5 93.3 134.0 778.0
colnames(data)[89] # cpc_avg_nro_cuota: average number of instalments across the client's products
## [1] "cpc_avg_nro_cuota"
summary(data[["cpc_avg_nro_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 20.33 39.96 66.38 305.50
hist(data$cpc_avg_nro_cuota)
# Kept as a stand-alone vector; it is bound into `unificado` later.
cpc_avg_nro_cuota <- data[["cpc_avg_nro_cuota"]]
colnames(data)[90] # cpc_max_nro_cuota: maximum number of instalments of a client across all products
## [1] "cpc_max_nro_cuota"
colnames(data)[91] # cp_saldo: previous-month balance per product
## [1] "cp_saldo"
summary(data[["cp_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
colnames(data)[92] # cp_cuota_sobre_inicial: instalment value over the initially disbursed value, per product
## [1] "cp_cuota_sobre_inicial"
summary(data[["cp_cuota_sobre_inicial"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
colnames(data)[93] # cpc_sum_saldo: sum of the balance of all of the client's active products
## [1] "cpc_sum_saldo"
summary(data[["cpc_sum_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -70455 0 33950 12216024 9920492 146717108
hist(data$cpc_sum_saldo)
# Percentiles in 1% steps
quantile(data[["cpc_sum_saldo"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -70455.00 -65826.00 -19656.58 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 36% 37% 38% 39% 40% 41%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 42% 43% 44% 45% 46% 47%
## 0.00 0.00 0.00 0.00 24490.00 31920.00
## 48% 49% 50% 51% 52% 53%
## 33950.00 33950.00 33950.00 237879.40 472658.20 738957.00
## 54% 55% 56% 57% 58% 59%
## 985207.00 1420982.55 1946855.76 2051116.00 2437914.82 2604606.00
## 60% 61% 62% 63% 64% 65%
## 2704777.80 3033503.54 3199161.00 3792700.64 4372629.00 4874872.00
## 66% 67% 68% 69% 70% 71%
## 5131967.56 5649048.00 7246359.24 7536951.00 7706847.00 8010945.00
## 72% 73% 74% 75% 76% 77%
## 8457241.00 8746891.55 9134322.00 9920492.50 11517312.00 12052637.97
## 78% 79% 80% 81% 82% 83%
## 12249094.00 14379750.00 15182781.20 16772655.45 17733736.74 18370147.00
## 84% 85% 86% 87% 88% 89%
## 19785500.00 23100863.00 23505359.00 26303831.91 27975363.72 29763054.02
## 90% 91% 92% 93% 94% 95%
## 33065306.00 36919522.00 47807424.63 73378700.49 78844063.50 80180499.99
## 96% 97% 98% 99% 100%
## 83182148.00 84670677.73 112508218.03 128558087.07 146717108.20
# Store the untrimmed predictor, then plot it capped at the cut-off
# (NA comparisons are discarded, as subset() would do).
cpc_sum_saldo <- data[["cpc_sum_saldo"]]
x <- cpc_sum_saldo[which(cpc_sum_saldo <= 220905770.00)]
hist(x)
colnames(data)[94] # cp_porc_saldo_ing: balance over income, per product
## [1] "cp_porc_saldo_ing"
summary(data[["cp_porc_saldo_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
colnames(data)[95] # gsm_mejor_gestion: best collection action carried out in the previous month
## [1] "gsm_mejor_gestion"
table(data[["gsm_mejor_gestion"]])
##
## 0 16
## 901 1
colnames(data)[96] # dh_min_dia_nomina_c
## [1] "dh_min_dia_nomina_c"
colnames(data)[97] # dh_max_dia_nomina_c: last day a payroll payment was received in the previous month
## [1] "dh_max_dia_nomina_c"
hist(data$dh_max_dia_nomina_c)
colnames(data)[98] # cp_valor_cuota: instalment value per product
## [1] "cp_valor_cuota"
summary(data[["cp_valor_cuota"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
colnames(data)[99] # cpc_nro_cuota_tdc: sum of the number of instalments across all of the client's credit cards
## [1] "cpc_nro_cuota_tdc"
colnames(data)[100] # gsm_prom_dias_gest: average of the days on which actions were carried out in the previous month
## [1] "gsm_prom_dias_gest"
colnames(data)[101] # pc_cuota_no_rot_ent: instalment of non-revolving products at the bank
## [1] "pc_cuota_no_rot_ent"
summary(data[["pc_cuota_no_rot_ent"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 1527 0 1377000
hist(data$pc_cuota_no_rot_ent)
colnames(data)[102] # dh_val_nomina_c: total value of incoming payroll transactions in a month
## [1] "dh_val_nomina_c"
summary(data[["dh_val_nomina_c"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 60297 0 7256848
hist(data$dh_val_nomina_c)
# Percentiles in 1% steps
quantile(data[["dh_val_nomina_c"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9%
## 0 0 0 0 0 0 0 0 0 0
## 10% 11% 12% 13% 14% 15% 16% 17% 18% 19%
## 0 0 0 0 0 0 0 0 0 0
## 20% 21% 22% 23% 24% 25% 26% 27% 28% 29%
## 0 0 0 0 0 0 0 0 0 0
## 30% 31% 32% 33% 34% 35% 36% 37% 38% 39%
## 0 0 0 0 0 0 0 0 0 0
## 40% 41% 42% 43% 44% 45% 46% 47% 48% 49%
## 0 0 0 0 0 0 0 0 0 0
## 50% 51% 52% 53% 54% 55% 56% 57% 58% 59%
## 0 0 0 0 0 0 0 0 0 0
## 60% 61% 62% 63% 64% 65% 66% 67% 68% 69%
## 0 0 0 0 0 0 0 0 0 0
## 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 0 0 0 0 0 0 0 0 0 0
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89%
## 0 0 0 0 0 0 0 0 0 0
## 90% 91% 92% 93% 94% 95% 96% 97% 98% 99%
## 0 0 0 0 0 0 0 0 0 3227685
## 100%
## 7256848
colnames(data)[103] # banca_completa: whether the client belongs to the "banca completa" segment
## [1] "banca_completa"
table(data[["banca_completa"]])
##
## 0
## 902
colnames(data)[104] # cpc_saldo_sobre_ing: balance over income, per client
## [1] "cpc_saldo_sobre_ing"
summary(data[["cpc_saldo_sobre_ing"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.01392 0.00000 0.00392 0.75912 0.43980 11.63406
hist(data$cpc_saldo_sobre_ing)
# Percentiles in 1% steps
quantile(data[["cpc_saldo_sobre_ing"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4%
## -0.0139196450 -0.0026716620 -0.0004434407 0.0000000000 0.0000000000
## 5% 6% 7% 8% 9%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 10% 11% 12% 13% 14%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 15% 16% 17% 18% 19%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 20% 21% 22% 23% 24%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 25% 26% 27% 28% 29%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 30% 31% 32% 33% 34%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 35% 36% 37% 38% 39%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 40% 41% 42% 43% 44%
## 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## 45% 46% 47% 48% 49%
## 0.0000000000 0.0006086835 0.0036859120 0.0039203230 0.0039203230
## 50% 51% 52% 53% 54%
## 0.0039203230 0.0091222870 0.0160144150 0.0241167220 0.0342690350
## 55% 56% 57% 58% 59%
## 0.0516975909 0.0762671575 0.0871885689 0.0959601992 0.0996937647
## 60% 61% 62% 63% 64%
## 0.1100301990 0.1167548170 0.1181152130 0.1414183478 0.1527656520
## 65% 66% 67% 68% 69%
## 0.2038563155 0.2406806960 0.2590162859 0.2652612750 0.2722557190
## 70% 71% 72% 73% 74%
## 0.3197385828 0.3433782008 0.3739022130 0.3850143830 0.4290230500
## 75% 76% 77% 78% 79%
## 0.4397979330 0.5008516424 0.5767091904 0.6159902625 0.7295992450
## 80% 81% 82% 83% 84%
## 0.8236684390 0.8864274193 0.9934920512 1.0631846570 1.1306370686
## 85% 86% 87% 88% 89%
## 1.1725284910 1.2668796990 1.4941067817 1.8447025345 1.9151466040
## 90% 91% 92% 93% 94%
## 2.1176921894 2.1883557560 2.2844911496 2.9200289999 5.4915779623
## 95% 96% 97% 98% 99%
## 5.8321611310 6.7831752090 7.1059412270 7.4480707058 8.7215061950
## 100%
## 11.6340582200
# Store the untrimmed predictor, then plot it capped at the cut-off.
cpc_saldo_sobre_ing <- data[["cpc_saldo_sobre_ing"]]
x <- cpc_saldo_sobre_ing[which(cpc_saldo_sobre_ing <= 20.307294513)]
hist(x)
colnames(data)[105] # dh_min_dia_pago_cred_d: first day a loan payment was made in the previous month
## [1] "dh_min_dia_pago_cred_d"
hist(data$dh_min_dia_pago_cred_d)
dh_min_dia_pago_cred_d <- data[["dh_min_dia_pago_cred_d"]]
colnames(data)[106] # cpc_saldo_tdc: client's total credit card balance
## [1] "cpc_saldo_tdc"
summary(data[["cpc_saldo_tdc"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -70455 0 33950 5068261 8010945 36919522
hist(data$cpc_saldo_tdc)
# Percentiles in 1% steps
quantile(data[["cpc_saldo_tdc"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -70455.00 -65826.00 -20000.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 36% 37% 38% 39% 40% 41%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 42% 43% 44% 45% 46% 47%
## 0.00 0.00 0.00 0.00 0.00 9483.00
## 48% 49% 50% 51% 52% 53%
## 31920.00 33950.00 33950.00 33950.00 40620.92 242390.33
## 54% 55% 56% 57% 58% 59%
## 350295.68 517486.50 738957.00 985207.00 1425073.34 1883099.63
## 60% 61% 62% 63% 64% 65%
## 2025065.80 2296037.81 2585201.00 2642201.49 2905231.56 3156513.95
## 66% 67% 68% 69% 70% 71%
## 3494659.42 4090121.68 4507996.36 4949410.41 5262763.00 5664027.00
## 72% 73% 74% 75% 76% 77%
## 7246970.96 7536951.00 7706353.52 8010945.00 8318098.24 8616938.17
## 78% 79% 80% 81% 82% 83%
## 9134322.00 9296475.24 10437137.80 11181913.72 11687764.00 12248475.71
## 84% 85% 86% 87% 88% 89%
## 12852364.00 13440597.70 14740435.54 15089862.00 16685620.00 17175586.00
## 90% 91% 92% 93% 94% 95%
## 17905425.00 18549522.56 19785500.00 21713390.07 22926946.00 23100863.00
## 96% 97% 98% 99% 100%
## 24470898.84 26387876.00 28790737.34 31877765.63 36919522.00
# Store the untrimmed predictor, then plot it capped at the cut-off.
cpc_saldo_tdc <- data[["cpc_saldo_tdc"]]
x <- cpc_saldo_tdc[which(cpc_saldo_tdc <= 43186883.96)]
hist(x)
colnames(data)[107] # pc_cuota_de_consumo: consumer-credit instalment reported by CIFIN
## [1] "pc_cuota_de_consumo"
summary(data[["pc_cuota_de_consumo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 5096 0 3356000
hist(data$pc_cuota_de_consumo)
colnames(data)[108] # dh_cant_tras_d: number of outgoing money transfers in a month
## [1] "dh_cant_tras_d"
hist(data$dh_cant_tras_d)
# Percentiles in 0.5% steps
quantile(data[["dh_cant_tras_d"]], probs = seq(0, 1, by = 0.005))
## 0.0% 0.5% 1.0% 1.5% 2.0% 2.5% 3.0% 3.5% 4.0% 4.5% 5.0%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5.5% 6.0% 6.5% 7.0% 7.5% 8.0% 8.5% 9.0% 9.5% 10.0% 10.5%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 11.0% 11.5% 12.0% 12.5% 13.0% 13.5% 14.0% 14.5% 15.0% 15.5% 16.0%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 16.5% 17.0% 17.5% 18.0% 18.5% 19.0% 19.5% 20.0% 20.5% 21.0% 21.5%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 22.0% 22.5% 23.0% 23.5% 24.0% 24.5% 25.0% 25.5% 26.0% 26.5% 27.0%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 27.5% 28.0% 28.5% 29.0% 29.5% 30.0% 30.5% 31.0% 31.5% 32.0% 32.5%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 33.0% 33.5% 34.0% 34.5% 35.0% 35.5% 36.0% 36.5% 37.0% 37.5% 38.0%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 38.5% 39.0% 39.5% 40.0% 40.5% 41.0% 41.5% 42.0% 42.5% 43.0% 43.5%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 44.0% 44.5% 45.0% 45.5% 46.0% 46.5% 47.0% 47.5% 48.0% 48.5% 49.0%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 49.5% 50.0% 50.5% 51.0% 51.5% 52.0% 52.5% 53.0% 53.5% 54.0% 54.5%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 55.0% 55.5% 56.0% 56.5% 57.0% 57.5% 58.0% 58.5% 59.0% 59.5% 60.0%
## 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
## 60.5% 61.0% 61.5% 62.0% 62.5% 63.0% 63.5% 64.0% 64.5% 65.0% 65.5%
## 0.000 0.000 0.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## 66.0% 66.5% 67.0% 67.5% 68.0% 68.5% 69.0% 69.5% 70.0% 70.5% 71.0%
## 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 2.000 2.000 2.000
## 71.5% 72.0% 72.5% 73.0% 73.5% 74.0% 74.5% 75.0% 75.5% 76.0% 76.5%
## 2.000 2.000 2.000 2.000 2.000 2.000 3.000 3.000 3.000 3.000 3.000
## 77.0% 77.5% 78.0% 78.5% 79.0% 79.5% 80.0% 80.5% 81.0% 81.5% 82.0%
## 3.000 3.000 4.000 4.000 4.000 4.000 4.000 4.000 4.000 5.000 5.000
## 82.5% 83.0% 83.5% 84.0% 84.5% 85.0% 85.5% 86.0% 86.5% 87.0% 87.5%
## 5.000 5.000 6.000 6.000 6.000 6.000 6.000 6.000 6.000 7.000 7.000
## 88.0% 88.5% 89.0% 89.5% 90.0% 90.5% 91.0% 91.5% 92.0% 92.5% 93.0%
## 7.000 8.000 8.000 9.000 9.000 9.000 9.000 10.000 10.000 10.000 10.000
## 93.5% 94.0% 94.5% 95.0% 95.5% 96.0% 96.5% 97.0% 97.5% 98.0% 98.5%
## 11.000 11.940 12.000 13.000 13.000 14.000 15.465 16.000 18.000 20.000 21.000
## 99.0% 99.5% 100.0%
## 21.000 21.495 30.000
# Store the untrimmed predictor, then plot it capped at the cut-off.
dh_cant_tras_d <- data[["dh_cant_tras_d"]]
x <- dh_cant_tras_d[which(dh_cant_tras_d <= 31)]
hist(x)
colnames(data)[109] # dh_max_dia_comisio_d
## [1] "dh_max_dia_comisio_d"
colnames(data)[110] # cpc_avg_saldo: average balance of the client's obligations in the previous month
## [1] "cpc_avg_saldo"
summary(data[["cpc_avg_saldo"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -70455 0 16975 4949118 5029954 103158078
hist(data$cpc_avg_saldo)
# Percentiles in 1% steps
quantile(data[["cpc_avg_saldo"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## -70455.00 -65826.00 -19628.29 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 36% 37% 38% 39% 40% 41%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 42% 43% 44% 45% 46% 47%
## 0.00 0.00 0.00 0.00 14169.66 16975.00
## 48% 49% 50% 51% 52% 53%
## 16975.00 16975.00 16975.00 118939.70 305999.12 433154.00
## 54% 55% 56% 57% 58% 59%
## 601811.48 796185.15 993033.54 1175050.00 1302303.00 1493580.88
## 60% 61% 62% 63% 64% 65%
## 1694424.33 1897335.85 2052581.75 2283110.98 2580505.34 2724391.55
## 66% 67% 68% 69% 70% 71%
## 2921941.00 3211054.84 3662326.44 3909918.00 4267080.33 4304110.50
## 72% 73% 74% 75% 76% 77%
## 4535613.02 4567161.00 4864807.92 5029954.00 5649048.00 6121840.71
## 78% 79% 80% 81% 82% 83%
## 6553668.23 6977784.50 7230429.80 7465720.25 7623077.00 7910066.74
## 84% 85% 86% 87% 88% 89%
## 8316979.75 9229880.50 10935486.07 11550431.50 12813655.74 13193938.00
## 90% 91% 92% 93% 94% 95%
## 13839320.64 14111779.62 16685620.00 17892250.14 22239378.76 24469808.29
## 96% 97% 98% 99% 100%
## 26367451.45 28919972.77 42970063.02 55008435.80 103158078.10
# Store the untrimmed predictor, then plot it capped at the cut-off.
cpc_avg_saldo <- data[["cpc_avg_saldo"]]
x <- cpc_avg_saldo[which(cpc_avg_saldo <= 106884393.50)]
hist(x)
colnames(data)[111] # dc_max_saldo_sf: maximum balance in the financial system (financial only)
## [1] "dc_max_saldo_sf"
summary(data[["dc_max_saldo_sf"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 167955 0 69454000
colnames(data)[112] # dh_val_pago_tarj_d: total value of outgoing credit card payment transactions in a month
## [1] "dh_val_pago_tarj_d"
summary(data[["dh_val_pago_tarj_d"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 555679 61884 10050161
hist(data$dh_val_pago_tarj_d)
# Percentiles in 1% steps
quantile(data[["dh_val_pago_tarj_d"]], probs = seq(0, 1, by = 0.01))
## 0% 1% 2% 3% 4% 5%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 6% 7% 8% 9% 10% 11%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 12% 13% 14% 15% 16% 17%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 18% 19% 20% 21% 22% 23%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 24% 25% 26% 27% 28% 29%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 30% 31% 32% 33% 34% 35%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 36% 37% 38% 39% 40% 41%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 42% 43% 44% 45% 46% 47%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 48% 49% 50% 51% 52% 53%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 54% 55% 56% 57% 58% 59%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 60% 61% 62% 63% 64% 65%
## 0.00 0.00 0.00 0.00 0.00 0.00
## 66% 67% 68% 69% 70% 71%
## 0.00 0.00 12200.00 20000.00 31920.00 31920.00
## 72% 73% 74% 75% 76% 77%
## 33950.00 33950.00 33950.00 61884.25 127762.36 189655.46
## 78% 79% 80% 81% 82% 83%
## 238181.48 453527.86 524195.40 602165.00 803614.04 954853.36
## 84% 85% 86% 87% 88% 89%
## 1107856.68 1183850.15 1401638.64 1483828.00 1584879.52 1677979.52
## 90% 91% 92% 93% 94% 95%
## 1727225.00 1954311.91 2315173.44 2523524.00 3057532.00 3447819.00
## 96% 97% 98% 99% 100%
## 4709415.00 5000000.00 5132114.72 7882268.24 10050161.00
# Store the untrimmed predictor, then plot it capped at the cut-off.
dh_val_pago_tarj_d <- data[["dh_val_pago_tarj_d"]]
x <- dh_val_pago_tarj_d[which(dh_val_pago_tarj_d <= 10560315.0)]
hist(x)
colnames(data)[113] # pc_productos_no_rotativos_entidad: number of non-revolving products at the bank
## [1] "pc_productos_no_rotativos_entidad"
table(data[["pc_productos_no_rotativos_entidad"]])
##
## 0 1
## 901 1
hist(data$pc_productos_no_rotativos_entidad)
colnames(data)[114] # pc_saldo_no_rot_ent: non-revolving obligations at the bank
## [1] "pc_saldo_no_rot_ent"
summary(data[["pc_saldo_no_rot_ent"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 40895 0 36887000
colnames(data)[115] # pc_vi_no_rotativos_entidad: non-revolving obligations at the bank
## [1] "pc_vi_no_rotativos_entidad"
summary(data[["pc_vi_no_rotativos_entidad"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 49.89 0.00 45000.00
colnames(data)[116] # dh_min_dia_entradas
## [1] "dh_min_dia_entradas"
colnames(data)[117] # pcons_hipotecario_vivienda: whether the product is a mortgage
## [1] "pcons_hipotecario_vivienda"
table(data[["pcons_hipotecario_vivienda"]])
##
## 0 1
## 879 23
# Kept as a stand-alone vector; it is bound into `unificado` later.
pcons_hipotecario_vivienda <- data[["pcons_hipotecario_vivienda"]]
colnames(data)[118] # gsm_mejor_gestion_3m: best collection action carried out in the last 3 months
## [1] "gsm_mejor_gestion_3m"
table(data[["gsm_mejor_gestion_3m"]])
##
## 0 16
## 901 1
colnames(data)[119] # dh_avg_dia_salidas: average day of the month on which money goes out
## [1] "dh_avg_dia_salidas"
# Response, segment and key columns.
# NOTE(review): the positional lookups below were annotated for a layout in
# which position 120 is y_auto_cura. In this data set the echoed output shows
# different names at positions 120, 122 and 123, so the trailing labels record
# the expected column, not the one actually printed.
names(data)[120] # expected y_auto_cura (response: whether the client self-cured); prints "llave" here
## [1] "llave"
# y_auto_cura yields an empty table in this data set, so Y ends up holding no values.
table(data$y_auto_cura)
## < table of extent 0 >
Y <- data$y_auto_cura
names(data)[122] # expected segmentoestructural (structural segment); prints "subsegmentoestructural" here
## [1] "subsegmentoestructural"
table(data$segmentoestructural)
##
## PYME
## 902
names(data)[123] # expected subsegmentoestructural; prints "anhomes_ciclo" here
## [1] "anhomes_ciclo"
table(data$subsegmentoestructural)
##
## Pyme Pequena
## 902
# Record key; it is attached to the scored rows further down.
KEY <- data$llave
length(KEY)
## [1] 902
# Combine the engineered variables into a single data frame.
# data.frame() is used instead of as.data.frame(cbind(...)): cbind() on plain
# vectors first builds a matrix, which forces every column to one common type
# and recycles vectors of different length. data.frame() keeps each column's
# own type and stops when the row counts are incompatible. Column names are
# taken from the variable names in both forms.
unificado <- data.frame(
  max_sem,
  desv_sem,
  prom_bim,
  max_mes_anterior,
  prom_mes_anterior,
  prom_sem,
  max_bim,
  prom_trim,
  pc_cant_moras_30_ult_12_meses,
  desv_trim,
  desv_bim,
  dh_cant_entradas,
  pc_transaccional,
  dh_val_entradas,
  pcons_tarjeta_de_credito,
  dh_avg_dia_retiros_d,
  dmi_max_egreso_diario,
  dh_cant_otros_d,
  dmi_max_ingreso_diario,
  dh_val_otros_d,
  pc_ingreso_final,
  dh_cant_pagos_d,
  dmi_ingreso_total_mes,
  dh_val_pagos_d,
  pc_gasto_familiar,
  pc_cuotas_pagadas,
  pc_ingreso_rutina_con_techo,
  dh_cant_salidas,
  dh_min_dia_pagos_d,
  pc_ingreso_por_rutina,
  dmi_egreso_total_mes,
  dh_val_salidas,
  cpc_avg_nro_cuota,
  cpc_sum_saldo,
  cpc_saldo_sobre_ing,
  dh_min_dia_pago_cred_d,
  cpc_saldo_tdc,
  dh_cant_tras_d,
  cpc_avg_saldo,
  dh_val_pago_tarj_d,
  pcons_hipotecario_vivienda,
  flag_mora60_ult12meses,
  flag_ultima_entrada_1quincena,
  flag_tuvo_mora60_ult3meses,
  flag_diaPago_1quincena,
  flag_es_cluster_6,
  flag_tuvo_mora90_ult12M,
  flax_maxDiaOtrosD_1quincena,
  flag_maxDiaPagos_d_1quincena,
  flag_encontrado_cifin
)
dim(unificado)
## [1] 902 50
names(unificado)
## [1] "max_sem" "desv_sem"
## [3] "prom_bim" "max_mes_anterior"
## [5] "prom_mes_anterior" "prom_sem"
## [7] "max_bim" "prom_trim"
## [9] "pc_cant_moras_30_ult_12_meses" "desv_trim"
## [11] "desv_bim" "dh_cant_entradas"
## [13] "pc_transaccional" "dh_val_entradas"
## [15] "pcons_tarjeta_de_credito" "dh_avg_dia_retiros_d"
## [17] "dmi_max_egreso_diario" "dh_cant_otros_d"
## [19] "dmi_max_ingreso_diario" "dh_val_otros_d"
## [21] "pc_ingreso_final" "dh_cant_pagos_d"
## [23] "dmi_ingreso_total_mes" "dh_val_pagos_d"
## [25] "pc_gasto_familiar" "pc_cuotas_pagadas"
## [27] "pc_ingreso_rutina_con_techo" "dh_cant_salidas"
## [29] "dh_min_dia_pagos_d" "pc_ingreso_por_rutina"
## [31] "dmi_egreso_total_mes" "dh_val_salidas"
## [33] "cpc_avg_nro_cuota" "cpc_sum_saldo"
## [35] "cpc_saldo_sobre_ing" "dh_min_dia_pago_cred_d"
## [37] "cpc_saldo_tdc" "dh_cant_tras_d"
## [39] "cpc_avg_saldo" "dh_val_pago_tarj_d"
## [41] "pcons_hipotecario_vivienda" "flag_mora60_ult12meses"
## [43] "flag_ultima_entrada_1quincena" "flag_tuvo_mora60_ult3meses"
## [45] "flag_diaPago_1quincena" "flag_es_cluster_6"
## [47] "flag_tuvo_mora90_ult12M" "flax_maxDiaOtrosD_1quincena"
## [49] "flag_maxDiaPagos_d_1quincena" "flag_encontrado_cifin"
# Visual check that the combined table and the key vector have the same
# number of entries (both print 902 below).
nrow(unificado)
## [1] 902
length(KEY)
## [1] 902
# Keep only the variables the model needs.
# Columns are selected directly by name so that a missing or misspelled
# predictor raises "undefined columns selected" instead of being dropped
# silently, which is what which(names(...) %in% ...) did. The names are listed
# in the same order as they appear in `unificado`, so the resulting column
# order is unchanged.
unificado <- unificado[, c(
  "desv_sem",
  "max_mes_anterior",
  "desv_trim",
  "dh_cant_entradas",
  "pcons_tarjeta_de_credito",
  "dh_avg_dia_retiros_d",
  "dh_cant_otros_d",
  "dmi_max_ingreso_diario",
  "dh_val_otros_d",
  "dh_cant_pagos_d",
  "dh_val_pagos_d",
  "pc_cuotas_pagadas",
  "dh_min_dia_pagos_d",
  "pc_ingreso_por_rutina",
  "cpc_saldo_sobre_ing",
  "dh_min_dia_pago_cred_d",
  "cpc_saldo_tdc",
  "dh_cant_tras_d",
  "cpc_avg_saldo",
  "dh_val_pago_tarj_d",
  "flag_tuvo_mora60_ult3meses",
  "flag_diaPago_1quincena",
  "flag_es_cluster_6",
  "flag_tuvo_mora90_ult12M",
  "flax_maxDiaOtrosD_1quincena",
  "flag_encontrado_cifin"
), drop = FALSE]
dim(unificado)
## [1] 902 26
names(unificado)
## [1] "desv_sem" "max_mes_anterior"
## [3] "desv_trim" "dh_cant_entradas"
## [5] "pcons_tarjeta_de_credito" "dh_avg_dia_retiros_d"
## [7] "dh_cant_otros_d" "dmi_max_ingreso_diario"
## [9] "dh_val_otros_d" "dh_cant_pagos_d"
## [11] "dh_val_pagos_d" "pc_cuotas_pagadas"
## [13] "dh_min_dia_pagos_d" "pc_ingreso_por_rutina"
## [15] "cpc_saldo_sobre_ing" "dh_min_dia_pago_cred_d"
## [17] "cpc_saldo_tdc" "dh_cant_tras_d"
## [19] "cpc_avg_saldo" "dh_val_pago_tarj_d"
## [21] "flag_tuvo_mora60_ult3meses" "flag_diaPago_1quincena"
## [23] "flag_es_cluster_6" "flag_tuvo_mora90_ult12M"
## [25] "flax_maxDiaOtrosD_1quincena" "flag_encontrado_cifin"
# Original note: "keep only the independent variables".
# NOTE(review): the statement below only prints the KEY vector; it does not
# filter anything.
KEY
## [1] 1 3 4 6 7 8 10 11 13 14 16 18 19 23 24
## [16] 25 28 30 32 33 39 40 42 43 46 48 50 51 55 56
## [31] 57 58 59 60 61 63 65 66 67 70 75 78 87 88 89
## [46] 93 94 95 96 97 99 100 101 103 104 105 106 107 108 109
## [61] 111 114 116 119 120 121 122 124 125 126 127 129 130 131 132
## [76] 133 134 136 138 141 142 143 144 145 147 148 149 152 153 154
## [91] 155 157 160 163 164 165 169 170 171 177 182 184 185 187 189
## [106] 190 192 195 206 210 211 215 218 219 224 225 226 227 229 231
## [121] 234 235 237 238 241 242 243 245 246 249 250 251 252 254 259
## [136] 260 261 262 265 266 267 268 271 278 280 282 291 293 295 299
## [151] 302 304 305 306 312 313 314 319 321 322 324 326 329 331 334
## [166] 338 342 344 345 348 350 357 359 362 364 365 366 369 371 372
## [181] 373 374 378 383 384 385 386 387 390 391 395 398 400 404 407
## [196] 411 412 413 414 415 420 422 423 424 425 428 433 435 436 437
## [211] 438 440 444 450 452 456 461 462 465 467 468 476 479 482 483
## [226] 484 485 487 488 489 490 492 493 495 497 498 500 504 505 515
## [241] 516 517 530 533 534 537 539 540 541 543 546 547 549 551 552
## [256] 553 554 556 557 558 559 562 563 564 568 572 574 576 577 578
## [271] 581 583 585 586 590 592 593 601 606 607 609 613 614 615 616
## [286] 619 620 621 622 628 629 632 633 635 636 638 640 642 643 645
## [301] 649 650 653 654 657 661 662 664 668 669 671 673 675 676 677
## [316] 680 681 683 685 687 688 689 690 691 694 695 696 704 708 709
## [331] 715 729 730 732 733 734 738 739 746 749 750 757 761 764 765
## [346] 775 781 784 785 787 788 790 792 794 795 798 813 815 816 817
## [361] 818 819 820 821 822 824 826 827 830 831 836 837 838 840 841
## [376] 843 844 849 852 853 854 855 858 865 873 876 885 886 887 893
## [391] 894 901 902 906 907 912 913 914 915 920 921 924 925 926 927
## [406] 931 934 939 941 943 944 949 950 951 954 956 961 962 966 967
## [421] 968 970 973 974 975 977 980 984 986 988 991 993 994 995 998
## [436] 1000 1008 1009 1011 1012 1013 1014 1020 1021 1022 1024 1028 1030 1031 1033
## [451] 1035 1036 1044 1049 1052 1059 1061 1067 1068 1071 1073 1075 1076 1077 1080
## [466] 1082 1083 1085 1086 1088 1092 1093 1095 1096 1098 1099 1107 1108 1110 1111
## [481] 1112 1118 1120 1125 1127 1129 1132 1133 1134 1135 1137 1139 1140 1142 1144
## [496] 1145 1147 1150 1151 1154 1155 1156 1157 1158 1159 1168 1172 1176 1177 1179
## [511] 1182 1185 1186 1193 1195 1196 1199 1205 1206 1207 1210 1213 1215 1216 1218
## [526] 1219 1221 1223 1226 1227 1229 1235 1237 1240 1242 1243 1246 1247 1249 1250
## [541] 1254 1255 1256 1257 1261 1265 1266 1268 1269 1271 1272 1274 1275 1276 1281
## [556] 1292 1304 1313 1314 1315 1316 1317 1320 1321 1322 1323 1324 1328 1330 1331
## [571] 1332 1333 1334 1335 1339 1340 1342 1345 1347 1353 1355 1357 1361 1362 1369
## [586] 1371 1384 1389 1390 1391 1392 1393 1395 1398 1401 1406 1407 1408 1411 1412
## [601] 1416 1420 1424 1426 1430 1435 1436 1437 1438 1439 1440 1442 1443 1444 1447
## [616] 1449 1452 1454 1455 1457 1459 1462 1464 1465 1466 1469 1471 1474 1476 1477
## [631] 1479 1481 1484 1486 1488 1491 1493 1500 1501 1505 1508 1509 1511 1513 1514
## [646] 1519 1522 1523 1524 1527 1528 1529 1530 1533 1535 1542 1543 1544 1545 1548
## [661] 1550 1551 1554 1555 1556 1557 1559 1561 1563 1564 1567 1569 1572 1573 1574
## [676] 1576 1577 1578 1579 1580 1582 1583 1584 1587 1588 1589 1591 1592 1594 1597
## [691] 1599 1601 1602 1605 1606 1607 1609 1610 1611 1613 1615 1618 1619 1620 1622
## [706] 1625 1628 1630 1634 1635 1637 1638 1641 1643 1645 1646 1648 1649 1650 1656
## [721] 1660 1663 1664 1667 1669 1671 1673 1675 1679 1680 1683 1684 1687 1689 1690
## [736] 1693 1696 1699 1700 1704 1707 1708 1710 1711 1715 1716 1717 1718 1721 1722
## [751] 1725 1728 1729 1730 1731 1732 1733 1734 1735 1737 1738 1742 1744 1747 1749
## [766] 1753 1756 1758 1759 1761 1762 1765 1768 1769 1770 1771 1772 1777 1779 1780
## [781] 1781 1783 1784 1786 1787 1789 1792 1794 1795 1797 1802 1803 1806 1808 1809
## [796] 1810 1811 1812 1813 1820 1821 1822 1824 1825 1829 1831 1833 1834 1837 1840
## [811] 1847 1850 1852 1854 1864 1865 1866 1867 1871 1872 1873 1880 1883 1884 1885
## [826] 1889 1895 1896 1898 1899 1904 1905 1906 1907 1908 1909 1910 1911 1913 1916
## [841] 1918 1921 1922 1924 1925 1928 1929 1933 1935 1936 1938 1939 1941 1942 1943
## [856] 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1959 1964
## [871] 1967 1968 1969 1970 1974 1976 1977 1978 1984 1987 1988 1989 1990 1991 1992
## [886] 1993 1994 1997 1998 2000 2005 2007 2010 2012 2013 2014 2017 2019 2021 2025
## [901] 2027 2028
length(KEY)
## [1] 902
# Apply the previously fitted `preprocesado` transformation (scaling, per the
# original note) to the selected predictors.
X <- predict(preprocesado, unificado)
head(X)
## desv_sem max_mes_anterior desv_trim dh_cant_entradas
## 1 -0.25529797 -0.50752897 -1.1941103 -1.06028059
## 2 0.04710176 0.65178349 0.7213348 -1.06028059
## 3 0.01993882 -0.03016501 0.3804477 -1.06028059
## 4 0.65106628 1.94748564 0.8481227 0.07513835
## 5 0.47165449 0.71997834 0.7897396 -0.45337364
## 6 0.73660743 -0.50752897 0.8408677 -0.20198454
## pcons_tarjeta_de_credito dh_avg_dia_retiros_d dh_cant_otros_d
## 1 -0.6752024 -1.2211367 -0.8645237
## 2 -0.6752024 -1.2211367 -0.8645237
## 3 -0.6752024 -1.2211367 -0.8645237
## 4 -0.6752024 0.5306734 0.8224426
## 5 -0.6752024 0.7530771 -0.8645237
## 6 -0.6752024 0.3631089 -0.1100894
## dmi_max_ingreso_diario dh_val_otros_d dh_cant_pagos_d dh_val_pagos_d
## 1 -0.5939123 -0.4234230 -0.77779963 -0.5468244
## 2 -0.5939123 -0.4234230 -0.77779963 -0.5468244
## 3 -0.5939123 -0.4234230 -0.77779963 -0.5468244
## 4 -0.3281884 -0.3711720 -0.45552275 0.5283690
## 5 -0.5179912 -0.4234230 -0.13324587 -0.1850389
## 6 -0.2744742 -0.4150344 0.02789256 0.2742168
## pc_cuotas_pagadas dh_min_dia_pagos_d pc_ingreso_por_rutina
## 1 -0.6194104 -0.6551503 -0.6333276
## 2 -0.6194104 -0.6551503 -0.6333276
## 3 -0.6194104 -0.6551503 -0.3666269
## 4 -0.6194104 0.5026302 -0.6248345
## 5 -0.6194104 -0.3243559 -0.6248345
## 6 -0.6194104 -0.4897531 -0.4254765
## cpc_saldo_sobre_ing dh_min_dia_pago_cred_d cpc_saldo_tdc dh_cant_tras_d
## 1 -0.3699031 -0.6821349 -0.6288475 -0.4777662
## 2 -0.3699031 -0.6821349 -0.6288475 -0.4777662
## 3 -0.3699031 -0.6821349 -0.6288475 -0.4777662
## 4 -0.3699031 0.2916044 -0.6288475 -0.4777662
## 5 -0.3699031 -0.4039237 -0.6288475 -0.4777662
## 6 -0.3699031 -0.5430293 -0.6288475 -0.4777662
## cpc_avg_saldo dh_val_pago_tarj_d flag_tuvo_mora60_ult3meses
## 1 -0.4360925 -0.3932686 -0.3840753
## 2 -0.4360925 -0.3932686 -0.3840753
## 3 -0.4360925 -0.3932686 -0.3840753
## 4 -0.4360925 -0.3932686 -0.3840753
## 5 -0.4360925 -0.3932686 -0.3840753
## 6 -0.4360925 -0.3932686 -0.3840753
## flag_diaPago_1quincena flag_es_cluster_6 flag_tuvo_mora90_ult12M
## 1 0.6415816 -1.008661 -0.2625406
## 2 0.6415816 -1.008661 -0.2625406
## 3 0.6415816 -1.008661 -0.2625406
## 4 0.6415816 -1.008661 -0.2625406
## 5 0.6415816 -1.008661 -0.2625406
## 6 0.6415816 -1.008661 -0.2625406
## flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
## 1 0.881491 -1.4521007
## 2 0.881491 -1.4521007
## 3 0.881491 0.6886098
## 4 -1.134363 0.6886098
## 5 0.881491 0.6886098
## 6 0.881491 0.6886098
# Put the record key in front of the transformed predictors. This is the call
# cbind() makes when one of its arguments is a data frame.
OOT <- data.frame(KEY, X, check.names = FALSE)
head(OOT)
## KEY desv_sem max_mes_anterior desv_trim dh_cant_entradas
## 1 1 -0.25529797 -0.50752897 -1.1941103 -1.06028059
## 2 3 0.04710176 0.65178349 0.7213348 -1.06028059
## 3 4 0.01993882 -0.03016501 0.3804477 -1.06028059
## 4 6 0.65106628 1.94748564 0.8481227 0.07513835
## 5 7 0.47165449 0.71997834 0.7897396 -0.45337364
## 6 8 0.73660743 -0.50752897 0.8408677 -0.20198454
## pcons_tarjeta_de_credito dh_avg_dia_retiros_d dh_cant_otros_d
## 1 -0.6752024 -1.2211367 -0.8645237
## 2 -0.6752024 -1.2211367 -0.8645237
## 3 -0.6752024 -1.2211367 -0.8645237
## 4 -0.6752024 0.5306734 0.8224426
## 5 -0.6752024 0.7530771 -0.8645237
## 6 -0.6752024 0.3631089 -0.1100894
## dmi_max_ingreso_diario dh_val_otros_d dh_cant_pagos_d dh_val_pagos_d
## 1 -0.5939123 -0.4234230 -0.77779963 -0.5468244
## 2 -0.5939123 -0.4234230 -0.77779963 -0.5468244
## 3 -0.5939123 -0.4234230 -0.77779963 -0.5468244
## 4 -0.3281884 -0.3711720 -0.45552275 0.5283690
## 5 -0.5179912 -0.4234230 -0.13324587 -0.1850389
## 6 -0.2744742 -0.4150344 0.02789256 0.2742168
## pc_cuotas_pagadas dh_min_dia_pagos_d pc_ingreso_por_rutina
## 1 -0.6194104 -0.6551503 -0.6333276
## 2 -0.6194104 -0.6551503 -0.6333276
## 3 -0.6194104 -0.6551503 -0.3666269
## 4 -0.6194104 0.5026302 -0.6248345
## 5 -0.6194104 -0.3243559 -0.6248345
## 6 -0.6194104 -0.4897531 -0.4254765
## cpc_saldo_sobre_ing dh_min_dia_pago_cred_d cpc_saldo_tdc dh_cant_tras_d
## 1 -0.3699031 -0.6821349 -0.6288475 -0.4777662
## 2 -0.3699031 -0.6821349 -0.6288475 -0.4777662
## 3 -0.3699031 -0.6821349 -0.6288475 -0.4777662
## 4 -0.3699031 0.2916044 -0.6288475 -0.4777662
## 5 -0.3699031 -0.4039237 -0.6288475 -0.4777662
## 6 -0.3699031 -0.5430293 -0.6288475 -0.4777662
## cpc_avg_saldo dh_val_pago_tarj_d flag_tuvo_mora60_ult3meses
## 1 -0.4360925 -0.3932686 -0.3840753
## 2 -0.4360925 -0.3932686 -0.3840753
## 3 -0.4360925 -0.3932686 -0.3840753
## 4 -0.4360925 -0.3932686 -0.3840753
## 5 -0.4360925 -0.3932686 -0.3840753
## 6 -0.4360925 -0.3932686 -0.3840753
## flag_diaPago_1quincena flag_es_cluster_6 flag_tuvo_mora90_ult12M
## 1 0.6415816 -1.008661 -0.2625406
## 2 0.6415816 -1.008661 -0.2625406
## 3 0.6415816 -1.008661 -0.2625406
## 4 0.6415816 -1.008661 -0.2625406
## 5 0.6415816 -1.008661 -0.2625406
## 6 0.6415816 -1.008661 -0.2625406
## flax_maxDiaOtrosD_1quincena flag_encontrado_cifin
## 1 0.881491 -1.4521007
## 2 0.881491 -1.4521007
## 3 0.881491 0.6886098
## 4 -1.134363 0.6886098
## 5 0.881491 0.6886098
## 6 0.881491 0.6886098
# Predicted probability for the OOT data set
KEY <- OOT$KEY
# Second column of the class-probability matrix returned by the fitted RF2.
# NOTE(review): the column is picked by position; confirm it is the positive
# class. OOT also carries KEY, which is presumably ignored by the model.
probabilidad <- as.numeric(predict(RF2, OOT, type = "prob")[, 2])
hist(probabilidad)
# Classify with a 0.50 cut-off
respuesta <- ifelse(probabilidad >= 0.50, 1, 0)
table(respuesta)
## respuesta
## 0 1
## 379 523
llave <- KEY
# data.frame() keeps each column's own type. as.data.frame(cbind(...)) would go
# through a matrix and turn the probabilities into text if the key were not
# numeric.
imprimir <- data.frame(llave, probabilidad, respuesta)
head(imprimir)
## llave probabilidad respuesta
## 1 1 0.42 0
## 2 3 0.46 0
## 3 4 0.26 0
## 4 6 0.25 0
## 5 7 0.31 0
## 6 8 0.40 0
table(imprimir$respuesta)
##
## 0 1
## 379 523
# write.csv(imprimir,"Base_prueba_evaluado.csv")