##### ----------- ooo -- ooo --------- #######
### CASH -- BLOCK
### Credit Score Script
### Exploracion
### ooo -- ooo
rm(list = ls()) # Limpiar entorno
##### ----------- ooo -- ooo --------- #######
### Directorio
setwd("/cloud/project")
##### ----------- ooo -- ooo --------- #######
#### Librerias
#install.packages("VIM") # kNN imputation
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(cluster)
library(FactoMineR)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret)
## Loading required package: lattice
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(nnet)
library(xgboost)
library(corrplot)
## corrplot 0.95 loaded
library(Information)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:Metrics':
##
## auc
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##### ----------- ooo -- ooo --------- #######
####Lectura
cred<-read.csv("BASE.csv", header = TRUE, row.names = NULL, sep=";", na.strings=-999.)
cred[1:2,]
## number dtscore dtscore_aux dtscoreco dtscoreco_aux edad IBC labor
## 1 1 668 0 NA 1 32 7.1 INDEPENDIENTE
## 2 2 737 0 668 0 32 6.6 EMPLEADO
## calificacion ingresos pendeuda co credito yeardde tinteres ncuotas genero
## 1 C1 2300000 0.02 0 16929000 2022 0.018 62 HOMBRE
## 2 B1 2417000 0.06 1 14562200 2022 0.018 63 HOMBRE
## cuota capacidad antiguedad_laboral estrato tipo_vivienda
## 1 421350 0.18 8 3 propia
## 2 362442 0.15 6 2 propia_pagando
## nivel_educativo creditos_activos personas_hogar eactual
## 1 tecnica/tecnologica 2 3 0
## 2 superior 1 4 0
dim(cred)
## [1] 835 26
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "edad"
## [7] "IBC" "labor" "calificacion"
## [10] "ingresos" "pendeuda" "co"
## [13] "credito" "yeardde" "tinteres"
## [16] "ncuotas" "genero" "cuota"
## [19] "capacidad" "antiguedad_laboral" "estrato"
## [22] "tipo_vivienda" "nivel_educativo" "creditos_activos"
## [25] "personas_hogar" "eactual"
mean(cred$eactual)
## [1] 0.1473054
##### ----------- ooo -- ooo --------- #######
####Categoricas
names(table(cred$labor))#La variable “labor" tien errores de digitación en el registro de las categorias
## [1] "EMPLEADO" "EMPLEADO " "INDEPENDIENTE" "INDEPENDIETE"
cred$labor[cred$labor== "EMPLEADO "] <- "EMPLEADO"
cred$labor[cred$labor== "INDEPENDIETE"] <- "INDEPENDIENTE"
names(table(cred$calificacion))#Calificacion es correcta
## [1] "B1" "C1"
cred$co[cred$co== "NO "] <- "0"
cred$co[cred$co== "NO"] <- "0"
cred$co[cred$co== "X"] <- "1"
cred$co[cred$co== "SI"] <- "1"
names(table(cred$co))#Nuevamente encontramos errores de digitación, ademas sabemos que la X implica que si existe la garantia
## [1] "0" "1"
names(table(cred$yeardde))#varible con categorias correctas
## [1] "2022" "2023" "2024" "2025"
names(table(cred$genero))#Nuevamente encontramos errores de digitación
## [1] " HOMBRE" " MUJER" "HOMBRE" "HOMBRE " "MUJER" "MUJER "
cred$genero[cred$genero== "HOMBRE"] <- "MASCULINO"
cred$genero[cred$genero== "HOMBRE "] <- "MASCULINO"
cred$genero[cred$genero== " HOMBRE"] <- "MASCULINO"
cred$genero[cred$genero== "MUJER"] <- "MUJER"
cred$genero[cred$genero== " MUJER"] <- "MUJER"
cred$genero[cred$genero== "MUJER "] <- "MUJER"
names(table(cred$tipo_vivienda))#varible con categorias correctas
## [1] "arriendo" "propia" "propia_pagando"
names(table(cred$nivel_educativo))#varible con categorias correctas
## [1] "basica" "media" "superior"
## [4] "tecnica/tecnologica"
names(table(cred$eactual))#eactual esta correctamente categorizada, sin embargo para la construccion de modelos reduciremos las categorias a 1 (impago) y 0 (pago)
## [1] "0" "1"
#para evitar incluir eventos situacionales en el comportamiento de impago se considerara impago partir de los 60 días
#categorizacion
cred$labor<-as.factor(cred$labor)
cred$calificacion<-as.factor(cred$calificacion)
cred$co<-as.factor(cred$co)
cred$yeardde<-as.factor(cred$yeardde)
cred$genero<-as.factor(cred$genero)
cred$tipo_vivienda<-as.factor(cred$tipo_vivienda)
cred$nivel_educativo<-as.factor(cred$nivel_educativo)
cred$eactual<-as.factor(cred$eactual)
cred$dtscore_aux<-as.factor(cred$dtscore_aux)
cred$dtscoreco_aux<-as.factor(cred$dtscoreco_aux)
summary(cred)
## number dtscore dtscore_aux dtscoreco dtscoreco_aux
## Min. : 1.0 Min. :276.0 0:744 Min. :150.0 0:318
## 1st Qu.:209.5 1st Qu.:694.8 1: 91 1st Qu.:702.0 1:517
## Median :418.0 Median :732.0 Median :762.0
## Mean :418.0 Mean :708.8 Mean :742.4
## 3rd Qu.:626.5 3rd Qu.:762.0 3rd Qu.:817.0
## Max. :835.0 Max. :883.0 Max. :909.0
## NA's :91 NA's :517
## edad IBC labor calificacion
## Min. :19.00 Min. : 3.300 EMPLEADO :714 B1:545
## 1st Qu.:24.00 1st Qu.: 5.300 INDEPENDIENTE:121 C1:290
## Median :29.00 Median : 5.900
## Mean :30.27 Mean : 6.186
## 3rd Qu.:34.00 3rd Qu.: 6.800
## Max. :69.00 Max. :18.800
##
## ingresos pendeuda co credito yeardde
## Min. : 320000 Min. :0.0000 0:534 Length:835 2022: 61
## 1st Qu.:1700000 1st Qu.:0.0000 1:301 Class :character 2023:154
## Median :2300000 Median :0.0800 Mode :character 2024:317
## Mean :2698410 Mean :0.1405 2025:303
## 3rd Qu.:3087500 3rd Qu.:0.2100
## Max. :7500000 Max. :2.7800
## NA's :41
## tinteres ncuotas genero cuota
## Min. :0.01600 Min. :12.00 MASCULINO:668 Min. : 126000
## 1st Qu.:0.01870 1st Qu.:36.00 MUJER :167 1st Qu.: 351088
## Median :0.02080 Median :48.00 Median : 422699
## Mean :0.02219 Mean :51.23 Mean : 435545
## 3rd Qu.:0.02500 3rd Qu.:63.00 3rd Qu.: 488142
## Max. :0.03100 Max. :72.00 Max. :2036384
##
## capacidad antiguedad_laboral estrato tipo_vivienda
## Min. :0.0200 Min. : 1.000 Min. :1.000 arriendo :254
## 1st Qu.:0.1300 1st Qu.: 2.000 1st Qu.:1.000 propia :204
## Median :0.1800 Median : 5.000 Median :2.000 propia_pagando:377
## Mean :0.1924 Mean : 6.611 Mean :2.109
## 3rd Qu.:0.2300 3rd Qu.: 9.000 3rd Qu.:3.000
## Max. :1.8900 Max. :35.000 Max. :4.000
## NA's :37
## nivel_educativo creditos_activos personas_hogar eactual
## basica :167 Min. :1.000 Min. :2.000 0:712
## media :279 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :163 Median :2.000 Median :3.000
## tecnica/tecnologica:226 Mean :1.758 Mean :3.216
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
##### ----------- ooo -- ooo --------- #######
####Escalares
#detección de valores atipicos en variable "creddtscore"
min(cred$dtscore, na.rm = TRUE)
## [1] 276
max(cred$dtscore, na.rm = TRUE)
## [1] 883
#todos los valores estan dentro de un rango normal
#detección de valores atipicos en variable "creddtscoreco"
min(cred$dtscoreco, na.rm = TRUE)
## [1] 150
max(cred$dtscoreco, na.rm = TRUE)
## [1] 909
cred<- cred[-(which(cred$dtscoreco == 909)),]
#existe un score de 909 cuando el valor maximo es de 900
#detección de valores atipicos en variable "creddtscoreco"
min(cred$edad, na.rm = TRUE)
## [1] 19
max(cred$edad, na.rm = TRUE)
## [1] 69
#no hay valores fuera de un rango normal para la varaible
#deteccion de atipicos para IBC (primero la paso a numerica)
cred$IBC <- as.numeric(cred$IBC)
min(cred$IBC, na.rm = TRUE)
## [1] 3.3
max(cred$IBC, na.rm = TRUE)
## [1] 18.8
cred<- cred[-(which(cred$IBC == 18.8)),]
#existe un al menos 1 valor atipico
#detección de valores atipicos en variable "antiguedad_laboral"
min(cred$antiguedad_laboral, na.rm = TRUE)
## [1] 1
max(cred$antiguedad_laboral, na.rm = TRUE)
## [1] 35
#todos los valores estan dentro de un rango normal
#detección de valores atipicos en variable "ncuotas"
min(cred$ncuotas, na.rm = TRUE)
## [1] 12
max(cred$ncuotas, na.rm = TRUE)
## [1] 72
#todos los valores estan dentro de un rango normal
#detección de valores atipicos en variable "estrato"
min(cred$estrato, na.rm = TRUE)
## [1] 1
max(cred$estrato, na.rm = TRUE)
## [1] 4
#todos los valores estan dentro de un rango normal
#detección de valores atipicos en variable "ingresos"
bp<-boxplot(cred$ingresos)

bp#Es posible que existan valores atipicos pero no se evidencia claramente
## $stats
## [,1]
## [1,] 320000
## [2,] 1700000
## [3,] 2300000
## [4,] 3100000
## [5,] 5200000
##
## $n
## [1] 791
##
## $conf
## [,1]
## [1,] 2221350
## [2,] 2378650
##
## $out
## [1] 6700000 5300000 7000000 7100000 7300000 7200000 6600000 6500000 7200000
## [10] 5700000 7000000 5500000 5600000 7300000 7500000 7400000 6600000 5492000
## [19] 6900000 5900000 6800000 6700000 6500000 7000000 5600000 5500000 6500000
## [28] 5500000 6300000 5800000 6000000 6300000 6700000 6600000 7100000 7100000
## [37] 6000000 7500000 6000000 7000000 5209970 7100000 5500000 6500000 6800000
## [46] 7000000 5400000 7000000 7100000 7100000 7000000 6800000 7200000 6500000
## [55] 5700000 7000000 5700000 7000000 7200000 7200000 7200000 6600000 5700000
## [64] 6500000
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] "1"
mediaingresos <- mean(cred$ingresos)
dsingresos<- sd(cred$ingresos)
liminferioringresos <- mediaingresos-(3*dsingresos)
limsuperioringresos <- mediaingresos+(3*dsingresos)
atipicosingresos <- subset(cred, (cred$ingresos< liminferioringresos | cred$ingresos > limsuperioringresos))
dim(atipicosingresos)#La regla empirica descarta la existencia de valores atipicos en la variable
## [1] 0 26
#detección de valores atipicos en variable "ingresospendeuda"
bp<-boxplot(cred$pendeuda)

bp#Es posible que existan valores atipicos
## $stats
## [,1]
## [1,] 0.00
## [2,] 0.00
## [3,] 0.08
## [4,] 0.21
## [5,] 0.47
##
## $n
## [1] 832
##
## $conf
## [,1]
## [1,] 0.0684969
## [2,] 0.0915031
##
## $out
## [1] 0.62 0.78 0.89 0.91 2.78 0.63 0.57 0.53 0.61 0.55 0.60 0.55 0.60 0.55 0.89
## [16] 0.55 0.61 0.78 0.62 0.60 0.60 0.89 0.63 0.63 2.78 0.61 0.55 0.53 0.91 0.53
## [31] 0.62 0.55 0.55 0.61 0.55 0.91
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] "1"
mediapendeuda <- mean(cred$pendeuda)
dspendeuda<- sd(cred$pendeuda)
liminferiorpendeuda<- mediapendeuda-(3*dspendeuda)
limsuperiorpendeuda<- mediapendeuda+(3*dspendeuda)
atipicospendeuda<- subset(cred, (cred$pendeuda< liminferiorpendeuda | cred$pendeuda > limsuperiorpendeuda))
dim(atipicospendeuda)#La regla empirica confirma la existencia de 10 valores atipicos en la variable
## [1] 10 26
#detección de valores atipicos en variable "credito"
cred$credito <- as.numeric(cred$credito)
## Warning: NAs introduced by coercion
bp<-boxplot(cred$credito)

bp#Es posible que existan valores atipicos
## $stats
## [,1]
## [1,] 4200000
## [2,] 10045000
## [3,] 11999500
## [4,] 14045000
## [5,] 20045000
##
## $n
## [1] 780
##
## $conf
## [,1]
## [1,] 11773208
## [2,] 12225792
##
## $out
## [1] 21045000 26045000 3803300 60045000 21045000 26045000 3803300 21045000
## [9] 60045000 26045000 26045000
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] "1"
cred$credito<- as.numeric(cred$credito)
mediacredito<- mean(cred$credito)
dscredito<- sd(cred$credito)
liminferiorcredito<- mediacredito-(3*dscredito)
limsuperiorcredito<- mediacredito+(3*dscredito)
atipicoscredito<- subset(cred, (cred$credito< liminferiorcredito | cred$credito > limsuperiorcredito))
dim(atipicoscredito)#no hay valores atipicos
## [1] 0 26
#detección de valores atipicos en variable "tinteres"
bp<-boxplot(cred$tinteres)

bp#no se evidencian valores atipicos se omite regla empirica
## $stats
## [,1]
## [1,] 0.0160
## [2,] 0.0187
## [3,] 0.0208
## [4,] 0.0250
## [5,] 0.0310
##
## $n
## [1] 832
##
## $conf
## [,1]
## [1,] 0.02045491
## [2,] 0.02114509
##
## $out
## numeric(0)
##
## $group
## numeric(0)
##
## $names
## [1] "1"
#detección de valores atipicos en variable "capacidad"
bp<-boxplot(cred$capacidad)

bp#es posible que existan valores atipicos
## $stats
## [,1]
## [1,] 0.02
## [2,] 0.13
## [3,] 0.18
## [4,] 0.23
## [5,] 0.38
##
## $n
## [1] 832
##
## $conf
## [,1]
## [1,] 0.1745223
## [2,] 0.1854777
##
## $out
## [1] 0.50 0.48 0.40 1.89 0.39 0.39 0.39 0.39 1.89 0.40 1.89 0.50 1.89 0.40 0.40
## [16] 0.48 0.48
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $names
## [1] "1"
mediacapacidad <- mean(cred$capacidad)
dscapacidad<- sd(cred$capacidad)
liminferiorcapacidad <- mediacapacidad-(3*dscapacidad)
limsuperiorcapacidad <- mediacapacidad+(3*dscapacidad)
atipicoscapacidad <- subset(cred, (cred$capacidad< liminferiorcapacidad | cred$capacidad > limsuperiorcapacidad))
dim(atipicoscapacidad)#se detectan 4 valores atipicos
## [1] 4 26
#detección de valores atipicos en variable "estrato"
bp<-boxplot(cred$estrato)

bp#no se evidencian valores atipicos se omite regla empirica
## $stats
## [,1]
## [1,] 1
## [2,] 1
## [3,] 2
## [4,] 3
## [5,] 4
##
## $n
## [1] 795
##
## $conf
## [,1]
## [1,] 1.887926
## [2,] 2.112074
##
## $out
## numeric(0)
##
## $group
## numeric(0)
##
## $names
## [1] "1"
#detección de valores atipicos en variable "creditos_activos"
bp<-boxplot(cred$creditos_activos)

bp#no se evidencian valores atipicos se omite regla empirica
## $stats
## [,1]
## [1,] 1
## [2,] 1
## [3,] 2
## [4,] 2
## [5,] 3
##
## $n
## [1] 832
##
## $conf
## [,1]
## [1,] 1.945223
## [2,] 2.054777
##
## $out
## numeric(0)
##
## $group
## numeric(0)
##
## $names
## [1] "1"
#detección de valores atipicos en variable "personas_hogar"
bp<-boxplot(cred$personas_hogar)

bp#no se evidencian valores atipicos se omite regla empirica
## $stats
## [,1]
## [1,] 2
## [2,] 3
## [3,] 3
## [4,] 4
## [5,] 5
##
## $n
## [1] 832
##
## $conf
## [,1]
## [1,] 2.945223
## [2,] 3.054777
##
## $out
## numeric(0)
##
## $group
## numeric(0)
##
## $names
## [1] "1"
cred$atipicospendeuda<-ifelse((cred$pendeuda< liminferiorpendeuda | cred$pendeuda > limsuperiorpendeuda),1,0)
cred$atipicoscapacidad<-ifelse((cred$capacidad < liminferiorcapacidad | cred$capacidad > limsuperiorcapacidad),1,0)
credlimpio <- subset(cred,(cred$atipicospendeuda==0 & cred$atipicoscapacidad==0))
dim(credlimpio)
## [1] 818 28
dim(cred)
## [1] 832 28
cred<-credlimpio
summary(cred)
## number dtscore dtscore_aux dtscoreco dtscoreco_aux
## Min. : 1.0 Min. :276.0 0:728 Min. :150.0 0:313
## 1st Qu.:212.2 1st Qu.:695.0 1: 90 1st Qu.:700.0 1:505
## Median :419.5 Median :732.0 Median :761.0
## Mean :419.4 Mean :710.1 Mean :741.1
## 3rd Qu.:627.8 3rd Qu.:762.0 3rd Qu.:817.0
## Max. :835.0 Max. :883.0 Max. :881.0
## NA's :90 NA's :505
## edad IBC labor calificacion
## Min. :19.00 Min. :3.300 EMPLEADO :702 B1:532
## 1st Qu.:24.00 1st Qu.:5.300 INDEPENDIENTE:116 C1:286
## Median :29.00 Median :5.900
## Mean :30.21 Mean :6.156
## 3rd Qu.:34.00 3rd Qu.:6.800
## Max. :69.00 Max. :9.000
##
## ingresos pendeuda co credito yeardde
## Min. : 660000 Min. :0.0000 0:521 Min. : 3803300 2022: 59
## 1st Qu.:1700000 1st Qu.:0.0000 1:297 1st Qu.:10045000 2023:149
## Median :2300000 Median :0.0750 Median :12150000 2024:308
## Mean :2683068 Mean :0.1273 Mean :12200356 2025:302
## 3rd Qu.:3050000 3rd Qu.:0.2100 3rd Qu.:14045000
## Max. :7500000 Max. :0.6300 Max. :60045000
## NA's :41 NA's :51
## tinteres ncuotas genero cuota
## Min. :0.01600 Min. :12.0 MASCULINO:656 Min. : 126000
## 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162 1st Qu.: 353700
## Median :0.02080 Median :48.0 Median : 422699
## Mean :0.02215 Mean :51.3 Mean : 435790
## 3rd Qu.:0.02500 3rd Qu.:63.0 3rd Qu.: 488142
## Max. :0.03100 Max. :72.0 Max. :2036384
##
## capacidad antiguedad_laboral estrato tipo_vivienda
## Min. :0.0200 Min. : 1.000 Min. :1.000 arriendo :253
## 1st Qu.:0.1300 1st Qu.: 2.000 1st Qu.:1.000 propia :196
## Median :0.1800 Median : 5.000 Median :2.000 propia_pagando:369
## Mean :0.1851 Mean : 6.589 Mean :2.098
## 3rd Qu.:0.2300 3rd Qu.: 9.000 3rd Qu.:3.000
## Max. :0.5000 Max. :35.000 Max. :4.000
## NA's :35
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 0:695
## media :276 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :157 Median :2.000 Median :3.000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
## atipicospendeuda atipicoscapacidad
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
##
## -- ### -- ### -------------
##-- Imputation : KNN algorithm
pred <- c("ingresos","estrato", "edad","dtscore", "cuota", "antiguedad_laboral")# seleccion de predictores
cred_se <- cred[, pred]
names(cred_se)
## [1] "ingresos" "estrato" "edad"
## [4] "dtscore" "cuota" "antiguedad_laboral"
summary(cred_se)
## ingresos estrato edad dtscore
## Min. : 660000 Min. :1.000 Min. :19.00 Min. :276.0
## 1st Qu.:1700000 1st Qu.:1.000 1st Qu.:24.00 1st Qu.:695.0
## Median :2300000 Median :2.000 Median :29.00 Median :732.0
## Mean :2683068 Mean :2.098 Mean :30.21 Mean :710.1
## 3rd Qu.:3050000 3rd Qu.:3.000 3rd Qu.:34.00 3rd Qu.:762.0
## Max. :7500000 Max. :4.000 Max. :69.00 Max. :883.0
## NA's :41 NA's :35 NA's :90
## cuota antiguedad_laboral
## Min. : 126000 Min. : 1.000
## 1st Qu.: 353700 1st Qu.: 2.000
## Median : 422699 Median : 5.000
## Mean : 435790 Mean : 6.589
## 3rd Qu.: 488142 3rd Qu.: 9.000
## Max. :2036384 Max. :35.000
##
# identificar N/A
uv <- c(which(is.na(cred_se$ingresos)), which(is.na(cred_se$estrato)), which(is.na(cred_se$dtscore)))
### ---- determinar numero de grupos
tem <- scale(cred_se[-uv,]) # estandarizar variables
dim(tem)
## [1] 661 6
### Matriz de distancias
cdistan<- dist(tem, method = "euclidean")## method=manhattan, sorensen, bray-curtis, gower
## estimar k optimos
fviz_nbclust(tem, kmeans, method = "wss", k.max=12) +
geom_vline(xintercept = 3, linetype = 2)

fviz_nbclust(tem, kmeans, method = "silhouette", k.max=12) +
geom_vline(xintercept = 3, linetype = 2)

#KNN estimacion
cred_iset <- kNN(cred_se, variable = "ingresos", k = 3, imp_var = FALSE)
## estrato edad dtscore cuota
## 1 19 276 126000
## antiguedad_laboral estrato edad dtscore
## 1 4 69 883
## cuota antiguedad_laboral
## 2036384 35
# Replace the original dtscore in cred with the imputed values
cred$ingresosimp <- cred_iset$ingresos
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "edad"
## [7] "IBC" "labor" "calificacion"
## [10] "ingresos" "pendeuda" "co"
## [13] "credito" "yeardde" "tinteres"
## [16] "ncuotas" "genero" "cuota"
## [19] "capacidad" "antiguedad_laboral" "estrato"
## [22] "tipo_vivienda" "nivel_educativo" "creditos_activos"
## [25] "personas_hogar" "eactual" "atipicospendeuda"
## [28] "atipicoscapacidad" "ingresosimp"
summary(cred)
## number dtscore dtscore_aux dtscoreco dtscoreco_aux
## Min. : 1.0 Min. :276.0 0:728 Min. :150.0 0:313
## 1st Qu.:212.2 1st Qu.:695.0 1: 90 1st Qu.:700.0 1:505
## Median :419.5 Median :732.0 Median :761.0
## Mean :419.4 Mean :710.1 Mean :741.1
## 3rd Qu.:627.8 3rd Qu.:762.0 3rd Qu.:817.0
## Max. :835.0 Max. :883.0 Max. :881.0
## NA's :90 NA's :505
## edad IBC labor calificacion
## Min. :19.00 Min. :3.300 EMPLEADO :702 B1:532
## 1st Qu.:24.00 1st Qu.:5.300 INDEPENDIENTE:116 C1:286
## Median :29.00 Median :5.900
## Mean :30.21 Mean :6.156
## 3rd Qu.:34.00 3rd Qu.:6.800
## Max. :69.00 Max. :9.000
##
## ingresos pendeuda co credito yeardde
## Min. : 660000 Min. :0.0000 0:521 Min. : 3803300 2022: 59
## 1st Qu.:1700000 1st Qu.:0.0000 1:297 1st Qu.:10045000 2023:149
## Median :2300000 Median :0.0750 Median :12150000 2024:308
## Mean :2683068 Mean :0.1273 Mean :12200356 2025:302
## 3rd Qu.:3050000 3rd Qu.:0.2100 3rd Qu.:14045000
## Max. :7500000 Max. :0.6300 Max. :60045000
## NA's :41 NA's :51
## tinteres ncuotas genero cuota
## Min. :0.01600 Min. :12.0 MASCULINO:656 Min. : 126000
## 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162 1st Qu.: 353700
## Median :0.02080 Median :48.0 Median : 422699
## Mean :0.02215 Mean :51.3 Mean : 435790
## 3rd Qu.:0.02500 3rd Qu.:63.0 3rd Qu.: 488142
## Max. :0.03100 Max. :72.0 Max. :2036384
##
## capacidad antiguedad_laboral estrato tipo_vivienda
## Min. :0.0200 Min. : 1.000 Min. :1.000 arriendo :253
## 1st Qu.:0.1300 1st Qu.: 2.000 1st Qu.:1.000 propia :196
## Median :0.1800 Median : 5.000 Median :2.000 propia_pagando:369
## Mean :0.1851 Mean : 6.589 Mean :2.098
## 3rd Qu.:0.2300 3rd Qu.: 9.000 3rd Qu.:3.000
## Max. :0.5000 Max. :35.000 Max. :4.000
## NA's :35
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 0:695
## media :276 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :157 Median :2.000 Median :3.000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
## atipicospendeuda atipicoscapacidad ingresosimp
## Min. :0 Min. :0 Min. : 660000
## 1st Qu.:0 1st Qu.:0 1st Qu.:1700000
## Median :0 Median :0 Median :2300000
## Mean :0 Mean :0 Mean :2660288
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:3000000
## Max. :0 Max. :0 Max. :7500000
##
cred[,c(10,29)]# comparacion
## ingresos ingresosimp
## 1 2300000 2300000
## 2 2417000 2417000
## 3 1800000 1800000
## 4 1400000 1400000
## 6 4460000 4460000
## 7 2800000 2800000
## 8 3700000 3700000
## 9 1100000 1100000
## 10 1800000 1800000
## 11 6700000 6700000
## 12 1210000 1210000
## 13 1449000 1449000
## 14 2800000 2800000
## 15 3100000 3100000
## 16 1100000 1100000
## 17 4209970 4209970
## 18 2100000 2100000
## 19 1500000 1500000
## 20 1250000 1250000
## 21 1300000 1300000
## 22 5300000 5300000
## 24 2540000 2540000
## 25 1360000 1360000
## 26 1600000 1600000
## 27 1510000 1510000
## 28 2100000 2100000
## 29 1726000 1726000
## 30 2850000 2850000
## 31 1800000 1800000
## 32 1350000 1350000
## 33 2000000 2000000
## 34 2278393 2278393
## 35 1700000 1700000
## 36 3500000 3500000
## 37 1800000 1800000
## 38 1800000 1800000
## 39 1200000 1200000
## 40 1625000 1625000
## 41 1400000 1400000
## 42 1600000 1600000
## 43 2398000 2398000
## 44 1300000 1300000
## 45 3166000 3166000
## 46 7000000 7000000
## 47 2300000 2300000
## 48 2500000 2500000
## 49 1200000 1200000
## 50 2300000 2300000
## 51 990000 990000
## 52 2090000 2090000
## 53 2000000 2000000
## 54 1800000 1800000
## 55 1600000 1600000
## 56 2100000 2100000
## 57 1900000 1900000
## 58 1800000 1800000
## 59 2200000 2200000
## 60 1300000 1300000
## 61 1800000 1800000
## 62 2500000 2500000
## 63 1700000 1700000
## 64 1800000 1800000
## 65 3000000 3000000
## 66 3000000 3000000
## 67 3562000 3562000
## 68 5100000 5100000
## 69 3300000 3300000
## 71 1200000 1200000
## 72 1400000 1400000
## 73 3000000 3000000
## 74 2000000 2000000
## 75 2600000 2600000
## 76 7300000 7300000
## 77 2500000 2500000
## 78 2700000 2700000
## 80 4200000 4200000
## 81 900000 900000
## 82 2650000 2650000
## 83 5000000 5000000
## 84 2500000 2500000
## 85 2500000 2500000
## 86 1685000 1685000
## 87 6600000 6600000
## 88 2500000 2500000
## 89 1600000 1600000
## 90 1800000 1800000
## 91 2600000 2600000
## 92 3900000 3900000
## 93 2000000 2000000
## 94 1400000 1400000
## 95 2200000 2200000
## 96 2000000 2000000
## 97 3500000 3500000
## 98 2500000 2500000
## 99 3200000 3200000
## 100 2030000 2030000
## 101 3500000 3500000
## 102 6500000 6500000
## 103 2400000 2400000
## 104 4000000 4000000
## 105 3000000 3000000
## 106 1300000 1300000
## 107 1300000 1300000
## 108 1200000 1200000
## 109 1250000 1250000
## 110 2100000 2100000
## 111 3000000 3000000
## 112 1600000 1600000
## 113 2800000 2800000
## 115 3000000 3000000
## 116 2500000 2500000
## 117 1600000 1600000
## 118 7200000 7200000
## 119 1500000 1500000
## 120 3100000 3100000
## 121 2800000 2800000
## 122 2450000 2450000
## 123 2050000 2050000
## 124 2000000 2000000
## 125 3470000 3470000
## 127 1500000 1500000
## 129 2100000 2100000
## 130 2100000 2100000
## 131 2300000 2300000
## 132 1050000 1050000
## 133 3800000 3800000
## 134 1800000 1800000
## 135 2300000 2300000
## 136 2400000 2400000
## 137 900000 900000
## 138 1500000 1500000
## 139 2100000 2100000
## 140 3600000 3600000
## 141 2800000 2800000
## 142 2300000 2300000
## 143 1550000 1550000
## 144 2550000 2550000
## 145 3000000 3000000
## 146 4492000 4492000
## 147 1750000 1750000
## 148 2762000 2762000
## 149 3200000 3200000
## 150 2600000 2600000
## 151 4100000 4100000
## 152 1400000 1400000
## 153 4600000 4600000
## 154 1400000 1400000
## 155 2300000 2300000
## 156 1838000 1838000
## 157 3100000 3100000
## 158 1800000 1800000
## 159 3000000 3000000
## 160 3800000 3800000
## 161 4100000 4100000
## 162 2800000 2800000
## 163 4100000 4100000
## 164 5700000 5700000
## 165 3000000 3000000
## 166 3900000 3900000
## 167 7000000 7000000
## 168 4000000 4000000
## 169 3600000 3600000
## 170 5500000 5500000
## 171 2400000 2400000
## 172 3800000 3800000
## 173 1900000 1900000
## 174 2100000 2100000
## 175 2600000 2600000
## 176 1300000 1300000
## 177 4000000 4000000
## 178 1900000 1900000
## 179 2150000 2150000
## 180 2300000 2300000
## 181 2400000 2400000
## 182 3300000 3300000
## 183 1650000 1650000
## 184 2700000 2700000
## 185 5600000 5600000
## 186 1700000 1700000
## 187 1300000 1300000
## 188 1100000 1100000
## 189 2700000 2700000
## 190 1500000 1500000
## 191 2900000 2900000
## 192 2300000 2300000
## 193 2000000 2000000
## 194 1600000 1600000
## 195 2400000 2400000
## 196 2800000 2800000
## 197 2500000 2500000
## 198 4200000 4200000
## 199 2100000 2100000
## 200 2800000 2800000
## 201 2800000 2800000
## 202 3900000 3900000
## 203 2000000 2000000
## 204 7300000 7300000
## 205 7500000 7500000
## 206 2800000 2800000
## 207 2800000 2800000
## 208 4149395 4149395
## 209 1700000 1700000
## 210 2800000 2800000
## 211 2500000 2500000
## 212 3500000 3500000
## 213 4000000 4000000
## 214 7400000 7400000
## 215 1600000 1600000
## 216 3025000 3025000
## 217 2550000 2550000
## 218 2900000 2900000
## 219 2525000 2525000
## 220 1900000 1900000
## 221 2520720 2520720
## 222 6600000 6600000
## 223 1423500 1423500
## 224 2200000 2200000
## 225 2500000 2500000
## 226 2050000 2050000
## 227 2000000 2000000
## 228 3200000 3200000
## 229 2400000 2400000
## 230 1900000 1900000
## 231 4600000 4600000
## 232 2800000 2800000
## 233 2200000 2200000
## 234 2100000 2100000
## 235 2000000 2000000
## 236 3100000 3100000
## 237 1400000 1400000
## 238 5492000 5492000
## 239 2100000 2100000
## 240 1350000 1350000
## 241 2000000 2000000
## 242 1550000 1550000
## 243 1450000 1450000
## 244 1750000 1750000
## 245 3250000 3250000
## 246 1800000 1800000
## 247 2300000 2300000
## 248 1400000 1400000
## 249 6900000 6900000
## 250 3300000 3300000
## 251 1600000 1600000
## 252 4000000 4000000
## 253 1600000 1600000
## 254 1500000 1500000
## 255 5900000 5900000
## 256 1749000 1749000
## 257 2800000 2800000
## 258 1250000 1250000
## 259 1738000 1738000
## 260 1700000 1700000
## 261 1400000 1400000
## 262 4000000 4000000
## 263 1700000 1700000
## 264 2050000 2050000
## 265 2200000 2200000
## 266 2800000 2800000
## 267 1800000 1800000
## 268 1600000 1600000
## 269 2400000 2400000
## 270 1400000 1400000
## 271 3600000 3600000
## 272 1900000 1900000
## 273 2250000 2250000
## 274 2900000 2900000
## 275 1700000 1700000
## 276 1900000 1900000
## 277 2500000 2500000
## 278 6800000 6800000
## 279 1200000 1200000
## 280 6700000 6700000
## 281 2000000 2000000
## 282 5000000 5000000
## 283 2700000 2700000
## 284 1930000 1930000
## 285 1600000 1600000
## 286 4000000 4000000
## 287 2385000 2385000
## 288 3600000 3600000
## 289 1600000 1600000
## 290 2300000 2300000
## 291 1600000 1600000
## 292 1500000 1500000
## 293 1400000 1400000
## 294 1300000 1300000
## 295 1700000 1700000
## 296 1400000 1400000
## 297 1450000 1450000
## 298 2200000 2200000
## 299 3100000 3100000
## 300 2600000 2600000
## 301 1500000 1500000
## 302 1500000 1500000
## 303 1800000 1800000
## 304 3200000 3200000
## 305 3200000 3200000
## 306 1900000 1900000
## 307 6500000 6500000
## 308 1400000 1400000
## 309 4200000 4200000
## 310 1500000 1500000
## 312 2100000 2100000
## 313 2300000 2300000
## 314 1743214 1743214
## 315 1850000 1850000
## 316 7000000 7000000
## 317 4000000 4000000
## 318 5600000 5600000
## 319 2400000 2400000
## 320 2970000 2970000
## 321 1600000 1600000
## 322 2525000 2525000
## 323 2500000 2500000
## 324 2000000 2000000
## 325 3100000 3100000
## 326 5500000 5500000
## 327 2038000 2038000
## 328 3600000 3600000
## 329 1800000 1800000
## 330 1400000 1400000
## 331 4000000 4000000
## 332 2400000 2400000
## 334 6500000 6500000
## 335 2900000 2900000
## 336 1500000 1500000
## 337 2000000 2000000
## 338 2800000 2800000
## 339 890000 890000
## 340 2800000 2800000
## 341 3200000 3200000
## 342 2300000 2300000
## 343 1800000 1800000
## 344 1449000 1449000
## 345 5500000 5500000
## 346 2000000 2000000
## 347 6300000 6300000
## 348 1600000 1600000
## 349 1500000 1500000
## 350 2000000 2000000
## 351 5800000 5800000
## 352 2000000 2000000
## 353 1500000 1500000
## 354 4800000 4800000
## 355 2500000 2500000
## 356 2700000 2700000
## 357 3500000 3500000
## 358 1998000 1998000
## 359 4100000 4100000
## 360 990000 990000
## 361 2200000 2200000
## 362 1700000 1700000
## 363 2600000 2600000
## 364 3300000 3300000
## 365 2500000 2500000
## 366 3200000 3200000
## 367 3300000 3300000
## 368 2550000 2550000
## 369 1800000 1800000
## 370 6000000 6000000
## 371 3100000 3100000
## 372 1800000 1800000
## 373 3600000 3600000
## 375 1526000 1526000
## 376 1150000 1150000
## 377 1700000 1700000
## 378 5100000 5100000
## 379 2100000 2100000
## 380 2325000 2325000
## 381 6300000 6300000
## 382 NA 2800000
## 383 1900000 1900000
## 384 2500000 2500000
## 385 1700000 1700000
## 386 2000000 2000000
## 387 1500000 1500000
## 388 2400000 2400000
## 389 2950000 2950000
## 390 2300000 2300000
## 391 2300000 2300000
## 392 2300000 2300000
## 393 1600000 1600000
## 394 2000000 2000000
## 395 NA 1600000
## 396 2100000 2100000
## 397 1700000 1700000
## 398 2000000 2000000
## 399 6700000 6700000
## 400 2000000 2000000
## 401 2500000 2500000
## 402 1400000 1400000
## 403 2400000 2400000
## 404 1800000 1800000
## 405 1326000 1326000
## 406 2540000 2540000
## 407 2600000 2600000
## 408 2900000 2900000
## 409 2800000 2800000
## 410 1223500 1223500
## 411 NA 1423500
## 412 6600000 6600000
## 413 2000000 2000000
## 414 2000000 2000000
## 415 2300000 2300000
## 416 1300000 1300000
## 417 3766000 3766000
## 418 3100000 3100000
## 419 1625000 1625000
## 420 2500000 2500000
## 421 2500000 2500000
## 422 2800000 2800000
## 423 3200000 3200000
## 424 1800000 1800000
## 425 1400000 1400000
## 426 2700000 2700000
## 427 NA 2550000
## 428 2050000 2050000
## 429 2200000 2200000
## 430 2000000 2000000
## 431 2050000 2050000
## 432 1900000 1900000
## 433 7100000 7100000
## 434 1700000 1700000
## 435 2000000 2000000
## 436 2100000 2100000
## 437 7100000 7100000
## 438 3100000 3100000
## 439 2500000 2500000
## 440 6000000 6000000
## 441 1650000 1650000
## 442 2100000 2100000
## 443 2430000 2430000
## 444 2500000 2500000
## 445 2700000 2700000
## 446 2400000 2400000
## 447 2025000 2025000
## 448 NA 2700000
## 449 3662000 3662000
## 450 2900000 2900000
## 451 2100000 2100000
## 452 1700000 1700000
## 453 1600000 1600000
## 454 1423500 1423500
## 455 1100000 1100000
## 456 2100000 2100000
## 457 3949395 3949395
## 458 2600000 2600000
## 459 4000000 4000000
## 460 NA 2100000
## 461 3050000 3050000
## 462 7500000 7500000
## 463 3300000 3300000
## 464 2100000 2100000
## 465 2900000 2900000
## 466 2000000 2000000
## 467 2750000 2750000
## 468 3500000 3500000
## 469 2525000 2525000
## 470 1300000 1300000
## 471 1500000 1500000
## 472 2300000 2300000
## 473 1600000 1600000
## 474 6000000 6000000
## 475 2200000 2200000
## 476 2500000 2500000
## 477 NA 2950000
## 478 3200000 3200000
## 479 7000000 7000000
## 480 2025000 2025000
## 481 890000 890000
## 482 1700000 1700000
## 483 4400000 4400000
## 484 1743214 1743214
## 485 3462000 3462000
## 486 NA 1500000
## 487 2700000 2700000
## 488 1700000 1700000
## 489 3000000 3000000
## 490 5209970 5209970
## 491 1200000 1200000
## 492 1500000 1500000
## 493 2900000 2900000
## 494 1800000 1800000
## 495 NA 3200000
## 496 2920720 2920720
## 497 2000000 2000000
## 498 7100000 7100000
## 499 1600000 1600000
## 500 2100000 2100000
## 501 2000000 2000000
## 502 5500000 5500000
## 503 1800000 1800000
## 504 2300000 2300000
## 505 1800000 1800000
## 506 2200000 2200000
## 507 1200000 1200000
## 508 1600000 1600000
## 509 NA 1500000
## 510 NA 1400000
## 511 4300000 4300000
## 512 6500000 6500000
## 513 5100000 5100000
## 514 3766000 3766000
## 515 2400000 2400000
## 517 2500000 2500000
## 518 4860000 4860000
## 519 1250000 1250000
## 520 1600000 1600000
## 521 3000000 3000000
## 522 1400000 1400000
## 523 NA 2600000
## 524 4400000 4400000
## 525 1000000 1000000
## 526 3100000 3100000
## 528 1290000 1290000
## 529 1600000 1600000
## 530 NA 2050000
## 531 2200000 2200000
## 532 1300000 1300000
## 533 2700000 2700000
## 534 1600000 1600000
## 535 2400000 2400000
## 536 3500000 3500000
## 537 3000000 3000000
## 538 1600000 1600000
## 539 2000000 2000000
## 540 4100000 4100000
## 541 1400000 1400000
## 542 1900000 1900000
## 543 6800000 6800000
## 544 1500000 1500000
## 545 2200000 2200000
## 546 7000000 7000000
## 547 1300000 1300000
## 548 1900000 1900000
## 549 1400000 1400000
## 550 NA 2100000
## 551 2800000 2800000
## 552 3200000 3200000
## 553 2500000 2500000
## 554 2500000 2500000
## 555 2500000 2500000
## 556 1900000 1900000
## 557 NA 1700000
## 558 2300000 2300000
## 559 3470000 3470000
## 560 2150000 2150000
## 561 3000000 3000000
## 562 1800000 1800000
## 563 1450000 1450000
## 564 1900000 1900000
## 565 3800000 3800000
## 566 5100000 5100000
## 567 NA 1500000
## 568 2200000 2200000
## 569 5400000 5400000
## 570 3000000 3000000
## 571 2300000 2300000
## 572 7000000 7000000
## 573 2400000 2400000
## 574 4000000 4000000
## 576 2200000 2200000
## 577 NA 1900000
## 578 2100000 2100000
## 579 NA 2550000
## 580 4200000 4200000
## 581 2250000 2250000
## 582 2400000 2400000
## 583 2200000 2200000
## 584 2100000 2100000
## 585 NA 1700000
## 586 3000000 3000000
## 587 4200000 4200000
## 588 2000000 2000000
## 589 7100000 7100000
## 590 1500000 1500000
## 591 2700000 2700000
## 592 3150000 3150000
## 593 4200000 4200000
## 594 3000000 3000000
## 595 1300000 1300000
## 596 1900000 1900000
## 597 NA 3100000
## 598 2500000 2500000
## 599 2200000 2200000
## 600 1450000 1450000
## 601 1250000 1250000
## 602 2178393 2178393
## 603 2000000 2000000
## 604 2820720 2820720
## 605 2000000 2000000
## 606 NA 2100000
## 607 1400000 1400000
## 608 2700000 2700000
## 610 1300000 1300000
## 611 1600000 1600000
## 612 4709970 4709970
## 613 NA 2500000
## 614 2100000 2100000
## 615 1500000 1500000
## 616 3200000 3200000
## 617 2100000 2100000
## 618 2800000 2800000
## 619 NA 1100000
## 620 2400000 2400000
## 621 4000000 4000000
## 622 1623500 1623500
## 623 2600000 2600000
## 624 1350000 1350000
## 625 4400000 4400000
## 626 3000000 3000000
## 627 900000 900000
## 628 1600000 1600000
## 629 1600000 1600000
## 630 NA 2050000
## 631 2620720 2620720
## 632 4460000 4460000
## 633 2700000 2700000
## 634 2300000 2300000
## 635 2600000 2600000
## 636 2650000 2650000
## 637 1750000 1750000
## 638 2900000 2900000
## 639 2500000 2500000
## 640 1700000 1700000
## 641 NA 2500000
## 642 2900000 2900000
## 643 2000000 2000000
## 644 2500000 2500000
## 645 1700000 1700000
## 646 2300000 2300000
## 647 1443214 1443214
## 648 2400000 2400000
## 649 1700000 1700000
## 650 1100000 1100000
## 651 1700000 1700000
## 652 5200000 5200000
## 653 2600000 2600000
## 654 2500000 2500000
## 655 NA 1600000
## 656 2500000 2500000
## 657 1600000 1600000
## 658 7000000 7000000
## 659 2400000 2400000
## 660 2000000 2000000
## 662 2038000 2038000
## 663 4700000 4700000
## 664 2243214 2243214
## 665 3000000 3000000
## 666 3900000 3900000
## 667 1810000 1810000
## 668 1500000 1500000
## 669 2400000 2400000
## 670 3200000 3200000
## 671 2700000 2700000
## 673 4000000 4000000
## 674 2100000 2100000
## 675 3000000 3000000
## 676 3600000 3600000
## 677 NA 1900000
## 678 3500000 3500000
## 679 2400000 2400000
## 680 7200000 7200000
## 681 1700000 1700000
## 682 2300000 2300000
## 683 1150000 1150000
## 684 2700000 2700000
## 685 2300000 2300000
## 686 1700000 1700000
## 687 4500000 4500000
## 688 6500000 6500000
## 689 2320720 2320720
## 690 2300000 2300000
## 691 1700000 1700000
## 692 1500000 1500000
## 693 2100000 2100000
## 694 3300000 3300000
## 695 1549000 1549000
## 696 2100000 2100000
## 697 2500000 2500000
## 698 2800000 2800000
## 699 2300000 2300000
## 700 1700000 1700000
## 701 3800000 3800000
## 702 2000000 2000000
## 703 NA 3500000
## 704 2050000 2050000
## 705 5100000 5100000
## 706 2100000 2100000
## 707 2200000 2200000
## 708 1300000 1300000
## 709 5700000 5700000
## 710 2950000 2950000
## 711 1900000 1900000
## 712 2500000 2500000
## 713 3000000 3000000
## 714 1800000 1800000
## 715 2600000 2600000
## 716 4000000 4000000
## 717 NA 2300000
## 718 1400000 1400000
## 719 7000000 7000000
## 720 4000000 4000000
## 721 1800000 1800000
## 722 2350000 2350000
## 723 2200000 2200000
## 724 2500000 2500000
## 725 3000000 3000000
## 726 2300000 2300000
## 727 1900000 1900000
## 728 3500000 3500000
## 729 1500000 1500000
## 730 2150000 2150000
## 731 3800000 3800000
## 732 NA 2198000
## 733 3900000 3900000
## 734 4000000 4000000
## 735 2000000 2000000
## 736 4200000 4200000
## 737 1600000 1600000
## 738 1250000 1250000
## 739 1000000 1000000
## 740 4100000 4100000
## 741 2700000 2700000
## 742 1840000 1840000
## 743 2200000 2200000
## 744 1423500 1423500
## 745 2198000 2198000
## 746 1800000 1800000
## 747 4100000 4100000
## 748 1900000 1900000
## 749 1126000 1126000
## 750 1400000 1400000
## 751 1900000 1900000
## 752 2298000 2298000
## 753 5700000 5700000
## 754 NA 2800000
## 755 1300000 1300000
## 756 1400000 1400000
## 757 1600000 1600000
## 758 1200000 1200000
## 759 3300000 3300000
## 760 2600000 2600000
## 761 2800000 2800000
## 762 NA 2500000
## 763 2000000 2000000
## 764 3770000 3770000
## 765 1100000 1100000
## 766 3500000 3500000
## 767 NA 1700000
## 768 2300000 2300000
## 769 1600000 1600000
## 770 2800000 2800000
## 771 2800000 2800000
## 772 3500000 3500000
## 773 1026000 1026000
## 774 1600000 1600000
## 775 1900000 1900000
## 776 NA 2200000
## 777 2300000 2300000
## 778 1260000 1260000
## 779 1600000 1600000
## 780 3000000 3000000
## 781 3000000 3000000
## 782 2300000 2300000
## 783 3300000 3300000
## 784 2400000 2400000
## 785 NA 4000000
## 786 7000000 7000000
## 787 660000 660000
## 788 4000000 4000000
## 789 2500000 2500000
## 790 1600000 1600000
## 791 2100000 2100000
## 792 1300000 1300000
## 793 NA 2000000
## 794 3100000 3100000
## 795 2900000 2900000
## 796 2600000 2600000
## 797 1600000 1600000
## 798 1400000 1400000
## 800 2700000 2700000
## 801 7200000 7200000
## 802 NA 2200000
## 803 NA 2800000
## 804 1350000 1350000
## 805 4700000 4700000
## 806 1243214 1243214
## 807 990000 990000
## 808 2400000 2400000
## 809 1600000 1600000
## 810 5200000 5200000
## 811 NA 1600000
## 812 2300000 2300000
## 813 2500000 2500000
## 814 3300000 3300000
## 815 1200000 1200000
## 816 2500000 2500000
## 817 2000000 2000000
## 818 2000000 2000000
## 819 NA 2400000
## 820 7200000 7200000
## 821 2350000 2350000
## 822 4400000 4400000
## 823 2000000 2000000
## 824 1100000 1100000
## 825 6600000 6600000
## 826 1600000 1600000
## 827 5700000 5700000
## 828 4349395 4349395
## 829 NA 2500000
## 830 3500000 3500000
## 831 2600000 2600000
## 832 1600000 1600000
## 833 6500000 6500000
## 834 2300000 2300000
## 835 2500000 2500000
sd(cred[,10], na.rm=TRUE); sd(cred[,29])# Las varianzas son similares,mantuvo la estructura de la distribución
## [1] 1399604
## [1] 1374540
pred <- c("estrato","ingresosimp", "edad","dtscore", "cuota", "antiguedad_laboral","personas_hogar")# seleccion de predictores
cred_se <- cred[, pred]
names(cred_se)
## [1] "estrato" "ingresosimp" "edad"
## [4] "dtscore" "cuota" "antiguedad_laboral"
## [7] "personas_hogar"
summary(cred_se)
## estrato ingresosimp edad dtscore
## Min. :1.000 Min. : 660000 Min. :19.00 Min. :276.0
## 1st Qu.:1.000 1st Qu.:1700000 1st Qu.:24.00 1st Qu.:695.0
## Median :2.000 Median :2300000 Median :29.00 Median :732.0
## Mean :2.098 Mean :2660288 Mean :30.21 Mean :710.1
## 3rd Qu.:3.000 3rd Qu.:3000000 3rd Qu.:34.00 3rd Qu.:762.0
## Max. :4.000 Max. :7500000 Max. :69.00 Max. :883.0
## NA's :35 NA's :90
## cuota antiguedad_laboral personas_hogar
## Min. : 126000 Min. : 1.000 Min. :2.000
## 1st Qu.: 353700 1st Qu.: 2.000 1st Qu.:3.000
## Median : 422699 Median : 5.000 Median :3.000
## Mean : 435790 Mean : 6.589 Mean :3.219
## 3rd Qu.: 488142 3rd Qu.: 9.000 3rd Qu.:4.000
## Max. :2036384 Max. :35.000 Max. :5.000
##
# identificar N/A
uv <- c(which(is.na(cred_se$estrato)), which(is.na(cred_se$dtscore)))
### ---- determinar numero de grupos
tem <- scale(cred_se[-uv,]) # estandarizar variables
dim(tem)
## [1] 697 7
### Matriz de distancias
cdistan<- dist(tem, method = "euclidean")## method=manhattan, sorensen, bray-curtis, gower
## estimar k optimos
fviz_nbclust(tem, kmeans, method = "wss", k.max=12) +
geom_vline(xintercept = 3, linetype = 2)

fviz_nbclust(tem, kmeans, method = "silhouette", k.max=12) +
geom_vline(xintercept = 3, linetype = 2)

#KNN estimacion
cred_se$estrato <- as.numeric(cred_se$estrato)
cred_iset <- kNN(cred_se, variable = "estrato", k = 3, imp_var = FALSE)
## ingresosimp edad dtscore cuota
## 660000 19 276 126000
## antiguedad_laboral personas_hogar ingresosimp edad
## 1 2 7500000 69
## dtscore cuota antiguedad_laboral personas_hogar
## 883 2036384 35 5
cred$estratoimp <- round(cred_iset$estrato)
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "edad"
## [7] "IBC" "labor" "calificacion"
## [10] "ingresos" "pendeuda" "co"
## [13] "credito" "yeardde" "tinteres"
## [16] "ncuotas" "genero" "cuota"
## [19] "capacidad" "antiguedad_laboral" "estrato"
## [22] "tipo_vivienda" "nivel_educativo" "creditos_activos"
## [25] "personas_hogar" "eactual" "atipicospendeuda"
## [28] "atipicoscapacidad" "ingresosimp" "estratoimp"
summary(cred)
## number dtscore dtscore_aux dtscoreco dtscoreco_aux
## Min. : 1.0 Min. :276.0 0:728 Min. :150.0 0:313
## 1st Qu.:212.2 1st Qu.:695.0 1: 90 1st Qu.:700.0 1:505
## Median :419.5 Median :732.0 Median :761.0
## Mean :419.4 Mean :710.1 Mean :741.1
## 3rd Qu.:627.8 3rd Qu.:762.0 3rd Qu.:817.0
## Max. :835.0 Max. :883.0 Max. :881.0
## NA's :90 NA's :505
## edad IBC labor calificacion
## Min. :19.00 Min. :3.300 EMPLEADO :702 B1:532
## 1st Qu.:24.00 1st Qu.:5.300 INDEPENDIENTE:116 C1:286
## Median :29.00 Median :5.900
## Mean :30.21 Mean :6.156
## 3rd Qu.:34.00 3rd Qu.:6.800
## Max. :69.00 Max. :9.000
##
## ingresos pendeuda co credito yeardde
## Min. : 660000 Min. :0.0000 0:521 Min. : 3803300 2022: 59
## 1st Qu.:1700000 1st Qu.:0.0000 1:297 1st Qu.:10045000 2023:149
## Median :2300000 Median :0.0750 Median :12150000 2024:308
## Mean :2683068 Mean :0.1273 Mean :12200356 2025:302
## 3rd Qu.:3050000 3rd Qu.:0.2100 3rd Qu.:14045000
## Max. :7500000 Max. :0.6300 Max. :60045000
## NA's :41 NA's :51
## tinteres ncuotas genero cuota
## Min. :0.01600 Min. :12.0 MASCULINO:656 Min. : 126000
## 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162 1st Qu.: 353700
## Median :0.02080 Median :48.0 Median : 422699
## Mean :0.02215 Mean :51.3 Mean : 435790
## 3rd Qu.:0.02500 3rd Qu.:63.0 3rd Qu.: 488142
## Max. :0.03100 Max. :72.0 Max. :2036384
##
## capacidad antiguedad_laboral estrato tipo_vivienda
## Min. :0.0200 Min. : 1.000 Min. :1.000 arriendo :253
## 1st Qu.:0.1300 1st Qu.: 2.000 1st Qu.:1.000 propia :196
## Median :0.1800 Median : 5.000 Median :2.000 propia_pagando:369
## Mean :0.1851 Mean : 6.589 Mean :2.098
## 3rd Qu.:0.2300 3rd Qu.: 9.000 3rd Qu.:3.000
## Max. :0.5000 Max. :35.000 Max. :4.000
## NA's :35
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 0:695
## media :276 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :157 Median :2.000 Median :3.000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
## atipicospendeuda atipicoscapacidad ingresosimp estratoimp
## Min. :0 Min. :0 Min. : 660000 Min. :1.000
## 1st Qu.:0 1st Qu.:0 1st Qu.:1700000 1st Qu.:1.000
## Median :0 Median :0 Median :2300000 Median :2.000
## Mean :0 Mean :0 Mean :2660288 Mean :2.092
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:3000000 3rd Qu.:3.000
## Max. :0 Max. :0 Max. :7500000 Max. :4.000
##
cred[,c(21,30)]# comparacion
## estrato estratoimp
## 1 3 3
## 2 2 2
## 3 2 2
## 4 1 1
## 6 3 3
## 7 3 3
## 8 1 1
## 9 1 1
## 10 2 2
## 11 1 1
## 12 2 2
## 13 1 1
## 14 3 3
## 15 3 3
## 16 1 1
## 17 1 1
## 18 1 1
## 19 1 1
## 20 1 1
## 21 1 1
## 22 4 4
## 24 2 2
## 25 1 1
## 26 1 1
## 27 2 2
## 28 1 1
## 29 3 3
## 30 2 2
## 31 1 1
## 32 2 2
## 33 1 1
## 34 4 4
## 35 1 1
## 36 4 4
## 37 1 1
## 38 1 1
## 39 3 3
## 40 1 1
## 41 2 2
## 42 2 2
## 43 1 1
## 44 2 2
## 45 2 2
## 46 1 1
## 47 3 3
## 48 1 1
## 49 1 1
## 50 1 1
## 51 1 1
## 52 1 1
## 53 2 2
## 54 3 3
## 55 1 1
## 56 2 2
## 57 4 4
## 58 1 1
## 59 1 1
## 60 2 2
## 61 1 1
## 62 2 2
## 63 1 1
## 64 1 1
## 65 3 3
## 66 4 4
## 67 3 3
## 68 1 1
## 69 1 1
## 71 1 1
## 72 1 1
## 73 2 2
## 74 3 3
## 75 1 1
## 76 2 2
## 77 1 1
## 78 2 2
## 80 4 4
## 81 1 1
## 82 1 1
## 83 3 3
## 84 3 3
## 85 1 1
## 86 3 3
## 87 3 3
## 88 1 1
## 89 1 1
## 90 2 2
## 91 2 2
## 92 3 3
## 93 1 1
## 94 2 2
## 95 1 1
## 96 2 2
## 97 1 1
## 98 3 3
## 99 2 2
## 100 4 4
## 101 1 1
## 102 4 4
## 103 3 3
## 104 4 4
## 105 4 4
## 106 2 2
## 107 1 1
## 108 2 2
## 109 1 1
## 110 1 1
## 111 NA 1
## 112 2 2
## 113 3 3
## 115 4 4
## 116 4 4
## 117 1 1
## 118 3 3
## 119 1 1
## 120 3 3
## 121 1 1
## 122 1 1
## 123 1 1
## 124 1 1
## 125 1 1
## 127 1 1
## 129 1 1
## 130 3 3
## 131 1 1
## 132 1 1
## 133 2 2
## 134 1 1
## 135 1 1
## 136 2 2
## 137 3 3
## 138 1 1
## 139 2 2
## 140 4 4
## 141 1 1
## 142 4 4
## 143 2 2
## 144 1 1
## 145 3 3
## 146 3 3
## 147 2 2
## 148 4 4
## 149 NA 2
## 150 3 3
## 151 4 4
## 152 1 1
## 153 4 4
## 154 3 3
## 155 3 3
## 156 2 2
## 157 4 4
## 158 1 1
## 159 1 1
## 160 1 1
## 161 3 3
## 162 1 1
## 163 2 2
## 164 2 2
## 165 NA 2
## 166 4 4
## 167 3 3
## 168 3 3
## 169 3 3
## 170 3 3
## 171 1 1
## 172 NA 3
## 173 1 1
## 174 1 1
## 175 2 2
## 176 1 1
## 177 2 2
## 178 2 2
## 179 2 2
## 180 2 2
## 181 2 2
## 182 NA 3
## 183 1 1
## 184 2 2
## 185 3 3
## 186 1 1
## 187 1 1
## 188 1 1
## 189 4 4
## 190 1 1
## 191 3 3
## 192 NA 1
## 193 1 1
## 194 4 4
## 195 1 1
## 196 2 2
## 197 3 3
## 198 3 3
## 199 NA 3
## 200 2 2
## 201 1 1
## 202 3 3
## 203 3 3
## 204 4 4
## 205 2 2
## 206 3 3
## 207 1 1
## 208 NA 4
## 209 1 1
## 210 4 4
## 211 4 4
## 212 3 3
## 213 NA 1
## 214 4 4
## 215 NA 1
## 216 3 3
## 217 3 3
## 218 2 2
## 219 1 1
## 220 2 2
## 221 2 2
## 222 4 4
## 223 1 1
## 224 2 2
## 225 4 4
## 226 3 3
## 227 1 1
## 228 2 2
## 229 1 1
## 230 2 2
## 231 2 2
## 232 NA 2
## 233 2 2
## 234 1 1
## 235 2 2
## 236 4 4
## 237 2 2
## 238 1 1
## 239 1 1
## 240 1 1
## 241 1 1
## 242 1 1
## 243 2 2
## 244 2 2
## 245 2 2
## 246 1 1
## 247 NA 2
## 248 3 3
## 249 3 3
## 250 2 2
## 251 1 1
## 252 3 3
## 253 2 2
## 254 3 3
## 255 3 3
## 256 1 1
## 257 4 4
## 258 2 2
## 259 2 2
## 260 2 2
## 261 1 1
## 262 3 3
## 263 1 1
## 264 4 4
## 265 3 3
## 266 2 2
## 267 1 1
## 268 1 1
## 269 2 2
## 270 3 3
## 271 2 2
## 272 2 2
## 273 1 1
## 274 3 3
## 275 1 1
## 276 1 1
## 277 1 1
## 278 3 3
## 279 1 1
## 280 3 3
## 281 2 2
## 282 3 3
## 283 2 2
## 284 4 4
## 285 2 2
## 286 4 4
## 287 2 2
## 288 3 3
## 289 3 3
## 290 2 2
## 291 1 1
## 292 1 1
## 293 1 1
## 294 2 2
## 295 1 1
## 296 2 2
## 297 2 2
## 298 3 3
## 299 1 1
## 300 3 3
## 301 1 1
## 302 1 1
## 303 2 2
## 304 3 3
## 305 2 2
## 306 2 2
## 307 3 3
## 308 1 1
## 309 2 2
## 310 1 1
## 312 4 4
## 313 3 3
## 314 4 4
## 315 1 1
## 316 2 2
## 317 3 3
## 318 2 2
## 319 1 1
## 320 2 2
## 321 3 3
## 322 3 3
## 323 3 3
## 324 2 2
## 325 1 1
## 326 3 3
## 327 2 2
## 328 3 3
## 329 1 1
## 330 3 3
## 331 3 3
## 332 3 3
## 334 2 2
## 335 3 3
## 336 1 1
## 337 3 3
## 338 1 1
## 339 1 1
## 340 4 4
## 341 1 1
## 342 2 2
## 343 2 2
## 344 1 1
## 345 4 4
## 346 2 2
## 347 4 4
## 348 3 3
## 349 1 1
## 350 1 1
## 351 4 4
## 352 3 3
## 353 1 1
## 354 2 2
## 355 2 2
## 356 1 1
## 357 3 3
## 358 1 1
## 359 3 3
## 360 1 1
## 361 1 1
## 362 1 1
## 363 2 2
## 364 1 1
## 365 1 1
## 366 1 1
## 367 3 3
## 368 4 4
## 369 3 3
## 370 3 3
## 371 1 1
## 372 1 1
## 373 3 3
## 375 4 4
## 376 1 1
## 377 3 3
## 378 4 4
## 379 1 1
## 380 2 2
## 381 3 3
## 382 4 4
## 383 3 3
## 384 3 3
## 385 1 1
## 386 2 2
## 387 1 1
## 388 4 4
## 389 1 1
## 390 1 1
## 391 1 1
## 392 3 3
## 393 1 1
## 394 1 1
## 395 1 1
## 396 1 1
## 397 1 1
## 398 1 1
## 399 4 4
## 400 2 2
## 401 2 2
## 402 1 1
## 403 3 3
## 404 3 3
## 405 4 4
## 406 2 2
## 407 2 2
## 408 4 4
## 409 3 3
## 410 2 2
## 411 1 1
## 412 4 4
## 413 1 1
## 414 1 1
## 415 2 2
## 416 1 1
## 417 2 2
## 418 1 1
## 419 2 2
## 420 2 2
## 421 1 1
## 422 1 1
## 423 3 3
## 424 1 1
## 425 3 3
## 426 3 3
## 427 3 3
## 428 1 1
## 429 2 2
## 430 1 1
## 431 1 1
## 432 2 2
## 433 4 4
## 434 4 4
## 435 3 3
## 436 1 1
## 437 4 4
## 438 3 3
## 439 2 2
## 440 2 2
## 441 1 1
## 442 1 1
## 443 4 4
## 444 4 4
## 445 2 2
## 446 1 1
## 447 3 3
## 448 2 2
## 449 3 3
## 450 2 2
## 451 1 1
## 452 2 2
## 453 1 1
## 454 1 1
## 455 1 1
## 456 2 2
## 457 4 4
## 458 4 4
## 459 4 4
## 460 1 1
## 461 2 2
## 462 3 3
## 463 4 4
## 464 2 2
## 465 2 2
## 466 2 2
## 467 4 4
## 468 2 2
## 469 2 2
## 470 1 1
## 471 2 2
## 472 2 2
## 473 1 1
## 474 2 2
## 475 3 3
## 476 3 3
## 477 4 4
## 478 1 1
## 479 3 3
## 480 1 1
## 481 1 1
## 482 2 2
## 483 2 2
## 484 3 3
## 485 2 2
## 486 1 1
## 487 2 2
## 488 1 1
## 489 3 3
## 490 1 1
## 491 2 2
## 492 3 3
## 493 4 4
## 494 2 2
## 495 4 4
## 496 3 3
## 497 2 2
## 498 3 3
## 499 4 4
## 500 3 3
## 501 1 1
## 502 3 3
## 503 1 1
## 504 3 3
## 505 2 2
## 506 1 1
## 507 1 1
## 508 2 2
## 509 1 1
## 510 1 1
## 511 1 1
## 512 2 2
## 513 2 2
## 514 1 1
## 515 1 1
## 517 2 2
## 518 1 1
## 519 1 1
## 520 1 1
## 521 4 4
## 522 4 4
## 523 2 2
## 524 3 3
## 525 1 1
## 526 1 1
## 528 2 2
## 529 2 2
## 530 1 1
## 531 3 3
## 532 1 1
## 533 2 2
## 534 1 1
## 535 1 1
## 536 4 4
## 537 1 1
## 538 1 1
## 539 1 1
## 540 3 3
## 541 3 3
## 542 3 3
## 543 3 3
## 544 2 2
## 545 4 4
## 546 3 3
## 547 1 1
## 548 2 2
## 549 2 2
## 550 1 1
## 551 4 4
## 552 4 4
## 553 3 3
## 554 3 3
## 555 2 2
## 556 1 1
## 557 1 1
## 558 3 3
## 559 1 1
## 560 1 1
## 561 3 3
## 562 2 2
## 563 1 1
## 564 1 1
## 565 3 3
## 566 4 4
## 567 1 1
## 568 1 1
## 569 4 4
## 570 2 2
## 571 1 1
## 572 3 3
## 573 2 2
## 574 2 2
## 576 1 1
## 577 2 2
## 578 NA 1
## 579 1 1
## 580 3 3
## 581 NA 4
## 582 1 1
## 583 2 2
## 584 2 2
## 585 1 1
## 586 1 1
## 587 3 3
## 588 2 2
## 589 3 3
## 590 1 1
## 591 2 2
## 592 1 1
## 593 2 2
## 594 NA 2
## 595 1 1
## 596 1 1
## 597 3 3
## 598 1 1
## 599 4 4
## 600 1 1
## 601 1 1
## 602 1 1
## 603 1 1
## 604 3 3
## 605 3 3
## 606 1 1
## 607 2 2
## 608 1 1
## 610 2 2
## 611 1 1
## 612 1 1
## 613 2 2
## 614 2 2
## 615 2 2
## 616 NA 2
## 617 3 3
## 618 1 1
## 619 1 1
## 620 2 2
## 621 3 3
## 622 1 1
## 623 2 2
## 624 1 1
## 625 3 3
## 626 1 1
## 627 1 1
## 628 3 3
## 629 1 1
## 630 1 1
## 631 3 3
## 632 2 2
## 633 1 1
## 634 2 2
## 635 3 3
## 636 3 3
## 637 1 1
## 638 3 3
## 639 3 3
## 640 1 1
## 641 4 4
## 642 1 1
## 643 1 1
## 644 1 1
## 645 3 3
## 646 1 1
## 647 1 1
## 648 1 1
## 649 3 3
## 650 NA 2
## 651 3 3
## 652 4 4
## 653 1 1
## 654 3 3
## 655 2 2
## 656 1 1
## 657 1 1
## 658 2 2
## 659 2 2
## 660 3 3
## 662 3 3
## 663 NA 3
## 664 3 3
## 665 1 1
## 666 3 3
## 667 2 2
## 668 2 2
## 669 1 1
## 670 4 4
## 671 3 3
## 673 4 4
## 674 1 1
## 675 3 3
## 676 2 2
## 677 1 1
## 678 4 4
## 679 1 1
## 680 3 3
## 681 2 2
## 682 4 4
## 683 1 1
## 684 1 1
## 685 NA 1
## 686 3 3
## 687 3 3
## 688 4 4
## 689 4 4
## 690 2 2
## 691 4 4
## 692 2 2
## 693 1 1
## 694 2 2
## 695 1 1
## 696 3 3
## 697 3 3
## 698 3 3
## 699 2 2
## 700 NA 1
## 701 3 3
## 702 2 2
## 703 4 4
## 704 1 1
## 705 3 3
## 706 1 1
## 707 3 3
## 708 1 1
## 709 3 3
## 710 4 4
## 711 2 2
## 712 4 4
## 713 NA 2
## 714 1 1
## 715 2 2
## 716 2 2
## 717 3 3
## 718 2 2
## 719 4 4
## 720 4 4
## 721 1 1
## 722 NA 3
## 723 1 1
## 724 4 4
## 725 1 1
## 726 2 2
## 727 1 1
## 728 NA 1
## 729 1 1
## 730 NA 1
## 731 4 4
## 732 2 2
## 733 3 3
## 734 3 3
## 735 NA 3
## 736 2 2
## 737 3 3
## 738 1 1
## 739 1 1
## 740 4 4
## 741 2 2
## 742 3 3
## 743 1 1
## 744 1 1
## 745 2 2
## 746 NA 1
## 747 3 3
## 748 2 2
## 749 4 4
## 750 1 1
## 751 2 2
## 752 2 2
## 753 2 2
## 754 NA 1
## 755 1 1
## 756 1 1
## 757 1 1
## 758 3 3
## 759 1 1
## 760 2 2
## 761 3 3
## 762 2 2
## 763 1 1
## 764 1 1
## 765 3 3
## 766 3 3
## 767 1 1
## 768 1 1
## 769 NA 1
## 770 1 1
## 771 2 2
## 772 4 4
## 773 3 3
## 774 1 1
## 775 1 1
## 776 4 4
## 777 2 2
## 778 1 1
## 779 NA 1
## 780 4 4
## 781 3 3
## 782 3 3
## 783 1 1
## 784 1 1
## 785 4 4
## 786 2 2
## 787 1 1
## 788 2 2
## 789 1 1
## 790 1 1
## 791 NA 1
## 792 1 1
## 793 1 1
## 794 3 3
## 795 2 2
## 796 3 3
## 797 1 1
## 798 3 3
## 800 4 4
## 801 3 3
## 802 3 3
## 803 3 3
## 804 NA 2
## 805 4 4
## 806 3 3
## 807 1 1
## 808 1 1
## 809 NA 3
## 810 4 4
## 811 1 1
## 812 1 1
## 813 4 4
## 814 2 2
## 815 1 1
## 816 4 4
## 817 NA 2
## 818 1 1
## 819 3 3
## 820 4 4
## 821 1 1
## 822 NA 2
## 823 2 2
## 824 1 1
## 825 4 4
## 826 1 1
## 827 3 3
## 828 1 1
## 829 1 1
## 830 3 3
## 831 NA 3
## 832 2 2
## 833 3 3
## 834 4 4
## 835 1 1
sd(cred[,21], na.rm=TRUE); sd(cred[,30])# Las varianzas son similares, mantuvo la estructura de la distribución
## [1] 1.061671
## [1] 1.056613
cred$credito<- ifelse(is.na(cred$credito),cred$cuota*((1-(1+cred$tinteres)^(-cred$ncuotas))/(cred$tinteres)),cred$credito)
cred<- cred[, setdiff(names(cred), c("ingresos","estrato"))]
cred<- cred[, setdiff(names(cred), c("atipicospendeuda","atipicoscapacidad"))]
cred<- cred[, setdiff(names(cred), c("yeardde"))]
summary(cred)
## number dtscore dtscore_aux dtscoreco dtscoreco_aux
## Min. : 1.0 Min. :276.0 0:728 Min. :150.0 0:313
## 1st Qu.:212.2 1st Qu.:695.0 1: 90 1st Qu.:700.0 1:505
## Median :419.5 Median :732.0 Median :761.0
## Mean :419.4 Mean :710.1 Mean :741.1
## 3rd Qu.:627.8 3rd Qu.:762.0 3rd Qu.:817.0
## Max. :835.0 Max. :883.0 Max. :881.0
## NA's :90 NA's :505
## edad IBC labor calificacion
## Min. :19.00 Min. :3.300 EMPLEADO :702 B1:532
## 1st Qu.:24.00 1st Qu.:5.300 INDEPENDIENTE:116 C1:286
## Median :29.00 Median :5.900
## Mean :30.21 Mean :6.156
## 3rd Qu.:34.00 3rd Qu.:6.800
## Max. :69.00 Max. :9.000
##
## pendeuda co credito tinteres ncuotas
## Min. :0.0000 0:521 Min. : 3803300 Min. :0.01600 Min. :12.0
## 1st Qu.:0.0000 1:297 1st Qu.:10045000 1st Qu.:0.01870 1st Qu.:36.0
## Median :0.0750 Median :11954000 Median :0.02080 Median :48.0
## Mean :0.1273 Mean :12237124 Mean :0.02215 Mean :51.3
## 3rd Qu.:0.2100 3rd Qu.:14045000 3rd Qu.:0.02500 3rd Qu.:63.0
## Max. :0.6300 Max. :60045000 Max. :0.03100 Max. :72.0
##
## genero cuota capacidad antiguedad_laboral
## MASCULINO:656 Min. : 126000 Min. :0.0200 Min. : 1.000
## MUJER :162 1st Qu.: 353700 1st Qu.:0.1300 1st Qu.: 2.000
## Median : 422699 Median :0.1800 Median : 5.000
## Mean : 435790 Mean :0.1851 Mean : 6.589
## 3rd Qu.: 488142 3rd Qu.:0.2300 3rd Qu.: 9.000
## Max. :2036384 Max. :0.5000 Max. :35.000
##
## tipo_vivienda nivel_educativo creditos_activos
## arriendo :253 basica :165 Min. :1.000
## propia :196 media :276 1st Qu.:1.000
## propia_pagando:369 superior :157 Median :2.000
## tecnica/tecnologica:220 Mean :1.762
## 3rd Qu.:2.000
## Max. :3.000
##
## personas_hogar eactual ingresosimp estratoimp
## Min. :2.000 0:695 Min. : 660000 Min. :1.000
## 1st Qu.:3.000 1:123 1st Qu.:1700000 1st Qu.:1.000
## Median :3.000 Median :2300000 Median :2.000
## Mean :3.219 Mean :2660288 Mean :2.092
## 3rd Qu.:4.000 3rd Qu.:3000000 3rd Qu.:3.000
## Max. :5.000 Max. :7500000 Max. :4.000
##
dim(cred)
## [1] 818 25
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "edad"
## [7] "IBC" "labor" "calificacion"
## [10] "pendeuda" "co" "credito"
## [13] "tinteres" "ncuotas" "genero"
## [16] "cuota" "capacidad" "antiguedad_laboral"
## [19] "tipo_vivienda" "nivel_educativo" "creditos_activos"
## [22] "personas_hogar" "eactual" "ingresosimp"
## [25] "estratoimp"
##### ----------- ooo -- ooo --------- #######
####Seleccion de variables Escalar-Escalar
number <- cred[,c("dtscore","dtscoreco","edad","IBC","pendeuda","credito","tinteres","capacidad","cuota","antiguedad_laboral","creditos_activos","personas_hogar","ingresosimp","estratoimp")]
number <- number[, sapply(number, function(x) sd(na.omit(x)) > 0.001 | is.factor(x))]
M = cor(number,use = "complete.obs") #"complete.obs" excluye NA
corrplot(M, method = 'number') # colorful number

#relación positiva fuerte entre estrato y antiguedad_laboral, se puede elimina una de ellas
#relación positiva fuerte entre credito y cuota, se puede eliminar una
#relación positiva fuerte entre edad y antiguedad_laboral
####Seleccion de variables Escalar-Categorica
number <- cred[,c("edad","IBC","pendeuda","credito","tinteres","capacidad","cuota","antiguedad_laboral","creditos_activos","personas_hogar","ingresosimp","estratoimp")]#redefinir sin NA
fisher_score <- function(escalar, interes) {
media_no_interes <- mean(escalar[interes == 0])
media_interes <- mean(escalar[interes == 1])
# Desviaciones estándar
sd_no_interes <- sd(escalar[interes == 0])
sd_interes <- sd(escalar[interes == 1])
# Cálculo Fisher Score
fisher <- abs(media_no_interes - media_interes) / sqrt(sd_no_interes^2 + sd_interes^2)
return(fisher)
}
scores<-sapply(number, fisher_score, interes=cred$eactual)
scores
## edad IBC pendeuda credito
## 0.045231378 0.262587074 0.050767899 0.003202857
## tinteres capacidad cuota antiguedad_laboral
## 0.099701179 0.038208462 0.042461717 0.152291613
## creditos_activos personas_hogar ingresosimp estratoimp
## 0.267894236 0.247060664 0.017883289 0.151427664
#las variables con Fisher Score < 0,2 son; edad, pendeuda, credito, tinteres, ncuotas, cuota, capacidad, antiguedad_laboral, ingresosimp, estratoimp,
#teniendo en cuenta los resultados de la matriz de correlación y de la prueba Fisher Score se determina eliminar credito, y edad.
#tambien se elimina ingresosimp ya su valor prediccivo es poco significativo
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "edad"
## [7] "IBC" "labor" "calificacion"
## [10] "pendeuda" "co" "credito"
## [13] "tinteres" "ncuotas" "genero"
## [16] "cuota" "capacidad" "antiguedad_laboral"
## [19] "tipo_vivienda" "nivel_educativo" "creditos_activos"
## [22] "personas_hogar" "eactual" "ingresosimp"
## [25] "estratoimp"
cred_back<-cred
cred<- cred[, setdiff(names(cred), c("edad","credito", "ingresosimp"))]
summary(cred)
## number dtscore dtscore_aux dtscoreco dtscoreco_aux
## Min. : 1.0 Min. :276.0 0:728 Min. :150.0 0:313
## 1st Qu.:212.2 1st Qu.:695.0 1: 90 1st Qu.:700.0 1:505
## Median :419.5 Median :732.0 Median :761.0
## Mean :419.4 Mean :710.1 Mean :741.1
## 3rd Qu.:627.8 3rd Qu.:762.0 3rd Qu.:817.0
## Max. :835.0 Max. :883.0 Max. :881.0
## NA's :90 NA's :505
## IBC labor calificacion pendeuda co
## Min. :3.300 EMPLEADO :702 B1:532 Min. :0.0000 0:521
## 1st Qu.:5.300 INDEPENDIENTE:116 C1:286 1st Qu.:0.0000 1:297
## Median :5.900 Median :0.0750
## Mean :6.156 Mean :0.1273
## 3rd Qu.:6.800 3rd Qu.:0.2100
## Max. :9.000 Max. :0.6300
##
## tinteres ncuotas genero cuota
## Min. :0.01600 Min. :12.0 MASCULINO:656 Min. : 126000
## 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162 1st Qu.: 353700
## Median :0.02080 Median :48.0 Median : 422699
## Mean :0.02215 Mean :51.3 Mean : 435790
## 3rd Qu.:0.02500 3rd Qu.:63.0 3rd Qu.: 488142
## Max. :0.03100 Max. :72.0 Max. :2036384
##
## capacidad antiguedad_laboral tipo_vivienda
## Min. :0.0200 Min. : 1.000 arriendo :253
## 1st Qu.:0.1300 1st Qu.: 2.000 propia :196
## Median :0.1800 Median : 5.000 propia_pagando:369
## Mean :0.1851 Mean : 6.589
## 3rd Qu.:0.2300 3rd Qu.: 9.000
## Max. :0.5000 Max. :35.000
##
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 0:695
## media :276 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :157 Median :2.000 Median :3.000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
## estratoimp
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.092
## 3rd Qu.:3.000
## Max. :4.000
##
####Seleccion de variables Categorica-Categorica
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "IBC"
## [7] "labor" "calificacion" "pendeuda"
## [10] "co" "tinteres" "ncuotas"
## [13] "genero" "cuota" "capacidad"
## [16] "antiguedad_laboral" "tipo_vivienda" "nivel_educativo"
## [19] "creditos_activos" "personas_hogar" "eactual"
## [22] "estratoimp"
iv <- cred[, c("dtscore_aux","dtscoreco_aux","IBC","labor","co","tinteres","cuota","nivel_educativo","creditos_activos","estratoimp","eactual","genero")]
iv$eactual <- as.numeric(as.character(iv$eactual))
IV <- create_infotables(data = iv, y = "eactual")
print(IV$Summary)
## Variable IV
## 8 nivel_educativo 0.224394979
## 3 IBC 0.176145318
## 6 tinteres 0.065301682
## 7 cuota 0.050987643
## 10 estratoimp 0.049970377
## 11 genero 0.046054475
## 9 creditos_activos 0.033738628
## 5 co 0.015585020
## 4 labor 0.001612934
## 2 dtscoreco_aux 0.001442805
## 1 dtscore_aux 0.000269541
# el criterio establece eliminar IV<0,02, vamos a mantener las siguientes variables: nivel_educativo, IBC
# tambien mantendermo co, estratoimp, genero, tinteres y creditos_activos aún cuando no cumplen con el criterio
names(cred)
## [1] "number" "dtscore" "dtscore_aux"
## [4] "dtscoreco" "dtscoreco_aux" "IBC"
## [7] "labor" "calificacion" "pendeuda"
## [10] "co" "tinteres" "ncuotas"
## [13] "genero" "cuota" "capacidad"
## [16] "antiguedad_laboral" "tipo_vivienda" "nivel_educativo"
## [19] "creditos_activos" "personas_hogar" "eactual"
## [22] "estratoimp"
cred <- cred[, setdiff(names(cred), c("number","dtscore_aux","dtscoreco_aux","labor","calificacion"))]
#resultados
summary(cred)
## dtscore dtscoreco IBC pendeuda co
## Min. :276.0 Min. :150.0 Min. :3.300 Min. :0.0000 0:521
## 1st Qu.:695.0 1st Qu.:700.0 1st Qu.:5.300 1st Qu.:0.0000 1:297
## Median :732.0 Median :761.0 Median :5.900 Median :0.0750
## Mean :710.1 Mean :741.1 Mean :6.156 Mean :0.1273
## 3rd Qu.:762.0 3rd Qu.:817.0 3rd Qu.:6.800 3rd Qu.:0.2100
## Max. :883.0 Max. :881.0 Max. :9.000 Max. :0.6300
## NA's :90 NA's :505
## tinteres ncuotas genero cuota
## Min. :0.01600 Min. :12.0 MASCULINO:656 Min. : 126000
## 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162 1st Qu.: 353700
## Median :0.02080 Median :48.0 Median : 422699
## Mean :0.02215 Mean :51.3 Mean : 435790
## 3rd Qu.:0.02500 3rd Qu.:63.0 3rd Qu.: 488142
## Max. :0.03100 Max. :72.0 Max. :2036384
##
## capacidad antiguedad_laboral tipo_vivienda
## Min. :0.0200 Min. : 1.000 arriendo :253
## 1st Qu.:0.1300 1st Qu.: 2.000 propia :196
## Median :0.1800 Median : 5.000 propia_pagando:369
## Mean :0.1851 Mean : 6.589
## 3rd Qu.:0.2300 3rd Qu.: 9.000
## Max. :0.5000 Max. :35.000
##
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 0:695
## media :276 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :157 Median :2.000 Median :3.000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
## estratoimp
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.092
## 3rd Qu.:3.000
## Max. :4.000
##
dim(cred)
## [1] 818 17
mean(as.numeric(as.character(cred$eactual)))
## [1] 0.1503667
##### ----------- ooo -- ooo --------- #######
####Construccion de modelos
#Separación de la base
names(cred)
## [1] "dtscore" "dtscoreco" "IBC"
## [4] "pendeuda" "co" "tinteres"
## [7] "ncuotas" "genero" "cuota"
## [10] "capacidad" "antiguedad_laboral" "tipo_vivienda"
## [13] "nivel_educativo" "creditos_activos" "personas_hogar"
## [16] "eactual" "estratoimp"
cred2 <- cred[, !(names(cred) %in% c("dtscore", "dtscoreco"))]
cred_train <- sample_frac(cred2, 0.75)
cred_test <- setdiff(cred2, cred_train)
dim(cred_train)
## [1] 614 15
dim(cred_test)
## [1] 204 15
#Modelo logistico con todas las varaibles
M1<-glm(eactual ~ .-eactual, data = cred_train, family = binomial(link="logit"))
summary(M1)
##
## Call:
## glm(formula = eactual ~ . - eactual, family = binomial(link = "logit"),
## data = cred_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.406e+00 1.714e+00 -1.988 0.0469 *
## IBC -1.899e-01 1.612e-01 -1.178 0.2388
## pendeuda -9.526e-01 9.715e-01 -0.981 0.3268
## co1 -1.254e-01 2.727e-01 -0.460 0.6457
## tinteres 1.089e+01 3.267e+01 0.333 0.7389
## ncuotas -5.949e-03 7.980e-03 -0.745 0.4560
## generoMUJER -5.241e-01 3.606e-01 -1.453 0.1462
## cuota 2.471e-07 7.091e-07 0.349 0.7274
## capacidad 2.409e-02 1.809e+00 0.013 0.9894
## antiguedad_laboral -9.697e-05 5.005e-02 -0.002 0.9985
## tipo_viviendapropia 1.547e+00 6.246e-01 2.477 0.0133 *
## tipo_viviendapropia_pagando -1.048e-02 3.739e-01 -0.028 0.9776
## nivel_educativomedia -2.587e-01 4.472e-01 -0.578 0.5630
## nivel_educativosuperior -7.451e-01 7.361e-01 -1.012 0.3115
## nivel_educativotecnica/tecnologica -9.484e-01 5.108e-01 -1.857 0.0633 .
## creditos_activos 9.555e-01 3.603e-01 2.652 0.0080 **
## personas_hogar 3.274e-01 2.548e-01 1.285 0.1987
## estratoimp 5.988e-02 2.986e-01 0.201 0.8411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 486.43 on 613 degrees of freedom
## Residual deviance: 442.88 on 596 degrees of freedom
## AIC: 478.88
##
## Number of Fisher Scoring iterations: 5
#Modelo logistico con menos varaibles
anova(M1,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: eactual
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 613 486.43
## IBC 1 9.9104 612 476.52 0.001643 **
## pendeuda 1 0.6475 611 475.87 0.421016
## co 1 0.0178 610 475.85 0.893885
## tinteres 1 0.8475 609 475.01 0.357267
## ncuotas 1 0.9155 608 474.09 0.338668
## genero 1 2.3414 607 471.75 0.125973
## cuota 1 1.2609 606 470.49 0.261490
## capacidad 1 0.0466 605 470.44 0.829121
## antiguedad_laboral 1 0.2223 604 470.22 0.637258
## tipo_vivienda 2 3.6981 602 466.52 0.157389
## nivel_educativo 3 15.1817 599 451.34 0.001668 **
## creditos_activos 1 6.6494 598 444.69 0.009919 **
## personas_hogar 1 1.7710 597 442.92 0.183253
## estratoimp 1 0.0401 596 442.88 0.841225
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
M2<-glm(eactual ~ genero+co+tinteres+tipo_vivienda+nivel_educativo+IBC, data = cred_train, family = binomial(link="logit"))
summary(M2)
##
## Call:
## glm(formula = eactual ~ genero + co + tinteres + tipo_vivienda +
## nivel_educativo + IBC, family = binomial(link = "logit"),
## data = cred_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4310 1.1206 -0.385 0.700550
## generoMUJER -0.5119 0.3505 -1.460 0.144204
## co1 -0.1462 0.2651 -0.551 0.581433
## tinteres 24.9997 29.4231 0.850 0.395513
## tipo_viviendapropia 1.1416 0.4335 2.633 0.008459 **
## tipo_viviendapropia_pagando 0.3332 0.3111 1.071 0.284133
## nivel_educativomedia -0.8334 0.3162 -2.635 0.008409 **
## nivel_educativosuperior -1.6989 0.5338 -3.183 0.001458 **
## nivel_educativotecnica/tecnologica -1.5433 0.3982 -3.876 0.000106 ***
## IBC -0.2221 0.1560 -1.423 0.154620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 486.43 on 613 degrees of freedom
## Residual deviance: 453.16 on 604 degrees of freedom
## AIC: 473.16
##
## Number of Fisher Scoring iterations: 5
#Modelo logistico con 3 variables
M3<-glm(eactual ~ genero+nivel_educativo+tipo_vivienda, data = cred_train, family = binomial(link="logit"))
summary(M3)
##
## Call:
## glm(formula = eactual ~ genero + nivel_educativo + tipo_vivienda,
## family = binomial(link = "logit"), data = cred_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1666 0.2846 -4.099 4.15e-05 ***
## generoMUJER -0.5306 0.3483 -1.523 0.12767
## nivel_educativomedia -0.9047 0.3057 -2.959 0.00308 **
## nivel_educativosuperior -2.0396 0.4780 -4.267 1.98e-05 ***
## nivel_educativotecnica/tecnologica -1.6875 0.3802 -4.438 9.06e-06 ***
## tipo_viviendapropia 1.1771 0.4279 2.751 0.00594 **
## tipo_viviendapropia_pagando 0.3575 0.3066 1.166 0.24357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 486.43 on 613 degrees of freedom
## Residual deviance: 456.16 on 607 degrees of freedom
## AIC: 470.16
##
## Number of Fisher Scoring iterations: 5
#Random Forests
M4 <- randomForest(eactual ~ ., data = cred_train, proximity=TRUE,ntree = 1000, mtry = 3, importance = TRUE)
names(cred_train)
## [1] "IBC" "pendeuda" "co"
## [4] "tinteres" "ncuotas" "genero"
## [7] "cuota" "capacidad" "antiguedad_laboral"
## [10] "tipo_vivienda" "nivel_educativo" "creditos_activos"
## [13] "personas_hogar" "eactual" "estratoimp"
#a partir de acá convertiremos las categoricas en numericas menos eactual
cred_train_num <- cred_train
cred_train_num$co <- as.numeric(cred_train$co)-1
cred_train_num$genero <- as.numeric(cred_train$genero) - 1
cred_train_num$nivel_educativo <- as.numeric(cred_train$nivel_educativo) - 1
cred_train_num$tipo_vivienda <- as.numeric(cred_train$tipo_vivienda) - 1
cred_test_num <- cred_test
cred_test_num$co <- as.numeric(cred_test$co)-1
cred_test_num$genero <- as.numeric(cred_test$genero) - 1
cred_test_num$nivel_educativo <- as.numeric(cred_test$nivel_educativo) - 1
cred_test_num$tipo_vivienda <- as.numeric(cred_test$tipo_vivienda) - 1
# Support Vector Machines
M5 <- svm(eactual ~ ., data = cred_train_num, scale = TRUE, kernel = "radial", cost = 10,probability = TRUE)
summary(M5)
##
## Call:
## svm(formula = eactual ~ ., data = cred_train_num, kernel = "radial",
## cost = 10, probability = TRUE, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 292
##
## ( 210 82 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
M6 = ksvm(eactual ~ ., data = cred_train_num, scale = TRUE, kernel = "rbfdot",prob.model = TRUE)
summary(M6)
## Length Class Mode
## 1 ksvm S4
#a partir de acá convertiremos las categoricas en numericas incluida eactual
cred_train_num_2 <- cred_train
cred_train_num_2$co <- as.numeric(cred_train$co)-1
cred_train_num_2$genero <- as.numeric(cred_train$genero) - 1
cred_train_num_2$tipo_vivienda <- as.numeric(cred_train$tipo_vivienda) - 1
cred_train_num_2$nivel_educativo <- as.numeric(cred_train$nivel_educativo) - 1
cred_train_num_2$eactual <- as.numeric(cred_train$eactual) - 1
cred_test_num_2 <- cred_test
cred_test_num_2$co <- as.numeric(cred_test$co)-1
cred_test_num_2$genero <- as.numeric(cred_test$genero) - 1
cred_test_num_2$tipo_vivienda <- as.numeric(cred_test$tipo_vivienda) - 1
cred_test_num_2$nivel_educativo <- as.numeric(cred_test$nivel_educativo) - 1
cred_test_num_2$eactual <- as.numeric(cred_test$eactual) - 1
#escalado train
x_train_num_2 <- cred_train_num_2[, setdiff(names(cred_train_num_2), "eactual")]
y_train_num_2 <- cred_train_num_2$eactual
x_train_scaled <- as.data.frame(scale(x_train_num_2))
cred_train_scaled <- cbind(x_train_scaled, eactual = y_train_num_2)
summary(cred_train_scaled)
## IBC pendeuda co tinteres
## Min. :-2.9198 Min. :-0.8848 Min. :-0.7361 Min. :-1.4955
## 1st Qu.:-0.8785 1st Qu.:-0.8848 1st Qu.:-0.7361 1st Qu.:-0.8288
## Median :-0.2661 Median :-0.3352 Median :-0.7361 Median :-0.3102
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6525 3rd Qu.: 0.5579 3rd Qu.: 1.3563 3rd Qu.: 0.7270
## Max. : 2.8979 Max. : 3.4431 Max. : 1.3563 Max. : 2.2087
## ncuotas genero cuota capacidad
## Min. :-2.4198 Min. :-0.5026 Min. :-1.75664 Min. :-2.10157
## 1st Qu.:-0.9335 1st Qu.:-0.5026 1st Qu.:-0.46014 1st Qu.:-0.69539
## Median :-0.1904 Median :-0.5026 Median :-0.09435 Median :-0.05621
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.7230 3rd Qu.:-0.5026 3rd Qu.: 0.26400 3rd Qu.: 0.58296
## Max. : 1.2958 Max. : 1.9862 Max. : 8.90272 Max. : 4.03450
## antiguedad_laboral tipo_vivienda nivel_educativo creditos_activos
## Min. :-0.9837 Min. :-1.3358 Min. :-1.4379 Min. :-1.3211
## 1st Qu.:-0.8059 1st Qu.:-1.3358 1st Qu.:-0.5125 1st Qu.:-1.3211
## Median :-0.2725 Median :-0.1741 Median :-0.5125 Median : 0.4196
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4387 3rd Qu.: 0.9876 3rd Qu.: 1.3384 3rd Qu.: 0.4196
## Max. : 5.0618 Max. : 0.9876 Max. : 1.3384 Max. : 2.1602
## personas_hogar estratoimp eactual
## Min. :-1.7061 Min. :-1.0499 Min. :0.0000
## 1st Qu.:-0.2924 1st Qu.:-1.0499 1st Qu.:0.0000
## Median :-0.2924 Median :-0.1005 Median :0.0000
## Mean : 0.0000 Mean : 0.0000 Mean :0.1352
## 3rd Qu.: 1.1213 3rd Qu.: 0.8489 3rd Qu.:0.0000
## Max. : 2.5350 Max. : 1.7982 Max. :1.0000
#escalado test
# Guardamos media y desviación de cada columna
train_means <- apply(x_train_num_2, 2, mean)
train_sds <- apply(x_train_num_2, 2, sd)
x_test_num_2 <- cred_test_num_2[, setdiff(names(cred_test_num_2), "eactual")]
x_test_scaled <- as.data.frame(scale(x_test_num_2, center = train_means, scale = train_sds))
cred_test_scaled <- cbind(x_test_scaled, eactual = cred_test_num_2$eactual)
# RED
M7<- nnet(eactual ~ ., data = cred_train_scaled, size = 5, maxit = 500, decay = 0.01)
## # weights: 81
## initial value 161.731117
## iter 10 value 77.392999
## iter 20 value 58.232840
## iter 30 value 53.025874
## iter 40 value 50.577206
## iter 50 value 48.287445
## iter 60 value 47.009508
## iter 70 value 45.760207
## iter 80 value 43.982745
## iter 90 value 43.295489
## iter 100 value 42.887495
## iter 110 value 42.647797
## iter 120 value 42.606122
## iter 130 value 42.599790
## iter 140 value 42.599499
## final value 42.599495
## converged
summary(M7)
## a 14-5-1 network with 81 weights
## options were - decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## -2.29 2.40 0.19 4.58 -1.56 -3.40 1.49 -0.24 -1.64 -2.14
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1
## -0.57 -0.64 2.58 -0.53 2.19
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## -3.04 -0.62 0.51 -1.48 0.10 -1.81 1.74 1.81 -2.24 -5.36
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2
## 0.13 0.29 -1.93 0.72 0.62
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## 4.19 0.39 1.47 -1.15 -2.32 5.93 -3.66 -2.20 1.34 3.65
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3
## 1.72 1.62 -1.16 -1.63 -2.23
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## -5.40 -1.27 2.09 1.80 0.57 -1.04 -2.45 2.15 -3.44 -4.22
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4
## 0.53 -0.67 -2.09 0.05 0.83
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## -2.02 3.15 0.76 0.86 -2.88 -5.06 2.56 0.09 -2.75 -3.45
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5
## 0.61 -1.87 2.93 0.23 1.42
## b->o h1->o h2->o h3->o h4->o h5->o
## 3.88 -10.18 -10.90 -6.70 6.03 5.25
#Xgboost
x_train <- as.matrix(cred_train_scaled[, setdiff(names(cred_train_scaled), "eactual")])
x_test <- as.matrix(cred_test_scaled[, setdiff(names(cred_test_scaled), "eactual")])
y_train <- as.numeric(cred_train_scaled$eactual)
y_test <- as.numeric(cred_test_scaled$eactual)
M8 <- xgboost(data = x_train, label = y_train, objective = "binary:logistic",eval_metric = "error",nrounds = 100, eta = 0.1,max_depth = 3,verbose = 1)
## [1] train-error:0.125407
## [2] train-error:0.125407
## [3] train-error:0.125407
## [4] train-error:0.125407
## [5] train-error:0.123779
## [6] train-error:0.118893
## [7] train-error:0.117264
## [8] train-error:0.115635
## [9] train-error:0.123779
## [10] train-error:0.123779
## [11] train-error:0.122150
## [12] train-error:0.122150
## [13] train-error:0.120521
## [14] train-error:0.122150
## [15] train-error:0.120521
## [16] train-error:0.120521
## [17] train-error:0.120521
## [18] train-error:0.120521
## [19] train-error:0.120521
## [20] train-error:0.120521
## [21] train-error:0.122150
## [22] train-error:0.122150
## [23] train-error:0.122150
## [24] train-error:0.122150
## [25] train-error:0.122150
## [26] train-error:0.122150
## [27] train-error:0.123779
## [28] train-error:0.123779
## [29] train-error:0.118893
## [30] train-error:0.118893
## [31] train-error:0.120521
## [32] train-error:0.120521
## [33] train-error:0.120521
## [34] train-error:0.118893
## [35] train-error:0.118893
## [36] train-error:0.118893
## [37] train-error:0.120521
## [38] train-error:0.120521
## [39] train-error:0.120521
## [40] train-error:0.120521
## [41] train-error:0.120521
## [42] train-error:0.120521
## [43] train-error:0.120521
## [44] train-error:0.120521
## [45] train-error:0.118893
## [46] train-error:0.117264
## [47] train-error:0.118893
## [48] train-error:0.118893
## [49] train-error:0.115635
## [50] train-error:0.115635
## [51] train-error:0.117264
## [52] train-error:0.117264
## [53] train-error:0.117264
## [54] train-error:0.117264
## [55] train-error:0.117264
## [56] train-error:0.115635
## [57] train-error:0.115635
## [58] train-error:0.115635
## [59] train-error:0.115635
## [60] train-error:0.115635
## [61] train-error:0.115635
## [62] train-error:0.115635
## [63] train-error:0.115635
## [64] train-error:0.114007
## [65] train-error:0.114007
## [66] train-error:0.112378
## [67] train-error:0.112378
## [68] train-error:0.112378
## [69] train-error:0.112378
## [70] train-error:0.112378
## [71] train-error:0.112378
## [72] train-error:0.112378
## [73] train-error:0.112378
## [74] train-error:0.114007
## [75] train-error:0.114007
## [76] train-error:0.112378
## [77] train-error:0.114007
## [78] train-error:0.112378
## [79] train-error:0.112378
## [80] train-error:0.112378
## [81] train-error:0.112378
## [82] train-error:0.112378
## [83] train-error:0.110749
## [84] train-error:0.110749
## [85] train-error:0.110749
## [86] train-error:0.107492
## [87] train-error:0.107492
## [88] train-error:0.112378
## [89] train-error:0.112378
## [90] train-error:0.105863
## [91] train-error:0.107492
## [92] train-error:0.107492
## [93] train-error:0.109121
## [94] train-error:0.109121
## [95] train-error:0.109121
## [96] train-error:0.109121
## [97] train-error:0.107492
## [98] train-error:0.105863
## [99] train-error:0.105863
## [100] train-error:0.104235
##### ----------- ooo -- ooo --------- #######
####Comparacion de modelos
pred1 <- predict(M1, cred_test, type = "response")
pred2 <- predict(M2, cred_test, type = "response")
pred3 <- predict(M3, cred_test, type = "response")
pred4 <- predict(M4, cred_test, type = "prob")[,2]
pred5 <- attr(predict(M5, cred_test_num, probability = TRUE), "probabilities")[,2]
pred6 <- predict(M6, cred_test_num, type = "probabilities")[,2]
pred7 <- as.numeric(predict(M7, cred_test_scaled))
pred8 <- predict(M8, x_test)
y_real <- cred_test$eactual
data.frame(
Modelo = c("M1 GLM completo",
"M2 GLM reducido",
"M3 GLM mínimo",
"M4 Random Forest",
"M5 SVM e1071",
"M6 SVM kernlab",
"M7 Red Neuronal",
"M8 XGBoost"),
AUC = round(c(
auc(roc(y_real, pred1)),
auc(roc(y_real, pred2)),
auc(roc(y_real, pred3)),
auc(roc(y_real, pred4)),
auc(roc(y_real, pred5)),
auc(roc(y_real, pred6)),
auc(roc(y_real, pred7)),
auc(roc(y_real, pred8))
), 4)
) %>% arrange(desc(AUC)) %>% print()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Modelo AUC
## 1 M2 GLM reducido 0.5892
## 2 M4 Random Forest 0.5796
## 3 M7 Red Neuronal 0.5782
## 4 M6 SVM kernlab 0.5736
## 5 M3 GLM mínimo 0.5729
## 6 M1 GLM completo 0.5718
## 7 M5 SVM e1071 0.5704
## 8 M8 XGBoost 0.5691
prop.table(table(cred$tipo_vivienda)) * 100
##
## arriendo propia propia_pagando
## 30.92910 23.96088 45.11002
prob_test <- predict(M2, cred_test, type = "response")
roc <- roc(cred_test$eactual, prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cut <- coords(roc, "best", ret = "threshold")[[1]]
cut
## [1] 0.131899
pred_column <- ifelse(prob_test >= cut, 1, 0)
table(Predicho = pred_column, Real = cred_test$eactual)
## Real
## Predicho 0 1
## 0 103 15
## 1 61 25
#accuracy
print((103+25)/(103+25+61+15))
## [1] 0.627451
#back test
cred_back<- cred_back[, setdiff(names(cred_back), c("edad", "ingresosimp"))]
cred_back <- cred_back[, setdiff(names(cred_back), c("number","dtscore_aux","dtscoreco_aux","labor","calificacion"))]
summary(cred_back)
## dtscore dtscoreco IBC pendeuda co
## Min. :276.0 Min. :150.0 Min. :3.300 Min. :0.0000 0:521
## 1st Qu.:695.0 1st Qu.:700.0 1st Qu.:5.300 1st Qu.:0.0000 1:297
## Median :732.0 Median :761.0 Median :5.900 Median :0.0750
## Mean :710.1 Mean :741.1 Mean :6.156 Mean :0.1273
## 3rd Qu.:762.0 3rd Qu.:817.0 3rd Qu.:6.800 3rd Qu.:0.2100
## Max. :883.0 Max. :881.0 Max. :9.000 Max. :0.6300
## NA's :90 NA's :505
## credito tinteres ncuotas genero
## Min. : 3803300 Min. :0.01600 Min. :12.0 MASCULINO:656
## 1st Qu.:10045000 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162
## Median :11954000 Median :0.02080 Median :48.0
## Mean :12237124 Mean :0.02215 Mean :51.3
## 3rd Qu.:14045000 3rd Qu.:0.02500 3rd Qu.:63.0
## Max. :60045000 Max. :0.03100 Max. :72.0
##
## cuota capacidad antiguedad_laboral tipo_vivienda
## Min. : 126000 Min. :0.0200 Min. : 1.000 arriendo :253
## 1st Qu.: 353700 1st Qu.:0.1300 1st Qu.: 2.000 propia :196
## Median : 422699 Median :0.1800 Median : 5.000 propia_pagando:369
## Mean : 435790 Mean :0.1851 Mean : 6.589
## 3rd Qu.: 488142 3rd Qu.:0.2300 3rd Qu.: 9.000
## Max. :2036384 Max. :0.5000 Max. :35.000
##
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 0:695
## media :276 1st Qu.:1.000 1st Qu.:3.000 1:123
## superior :157 Median :2.000 Median :3.000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219
## 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :3.000 Max. :5.000
##
## estratoimp
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.092
## 3rd Qu.:3.000
## Max. :4.000
##
cred_back$prob_back <- predict(M2, cred_back, type = "response")
roc <- roc(cred_back$eactual, cred_back$prob_back)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cut <- coords(roc, "best", ret = "threshold")[[1]]
cut
## [1] 0.1266185
cred_back$pred_column_back <- ifelse(cred_back$prob_back >= cut, 1, 0)
cred_back$result_estimado<-cred_back$pred_column_back*cred_back$credito
cred_back$eactual<-as.numeric(cred_back$eactual)-1
cred_back$result_real<-cred_back$eactual*cred_back$credito
dim(cred_back)
## [1] 818 22
sum(cred_back$eactual)
## [1] 123
sum(cred_back$result_real)
## [1] 1503272657
sum(cred_back$pred_column_back)
## [1] 335
sum(cred_back$result_estimado)
## [1] 3961522472
q4 <- quantile(cred_back$prob_back, 0.75, na.rm = TRUE)
q4
## 75%
## 0.1678094
cred_back$pred_column_back_Q4 <- ifelse(cred_back$prob_back >= q4, 1, 0)
cred_back$result_estimado_Q4<-cred_back$pred_column_back_Q4*cred_back$credito
sum(cred_back$pred_column_back_Q4)
## [1] 205
sum(cred_back$result_estimado_Q4)
## [1] 2440755770
q5 <- quantile(cred_back$prob_back, 0.80, na.rm = TRUE)
q5
## 80%
## 0.198052
cred_back$pred_column_back_Q5 <- ifelse(cred_back$prob_back >= q5, 1, 0)
cred_back$result_estimado_Q5<-cred_back$pred_column_back_Q5*cred_back$credito
sum(cred_back$pred_column_back_Q5)
## [1] 164
sum(cred_back$result_estimado_Q5)
## [1] 1900642135
q6 <- quantile(cred_back$prob_back, 0.90, na.rm = TRUE)
q6
## 90%
## 0.2789369
cred_back$pred_column_back_Q6 <- ifelse(cred_back$prob_back >= q6, 1, 0)
cred_back$result_estimado_Q6<-cred_back$pred_column_back_Q6*cred_back$credito
sum(cred_back$pred_column_back_Q6)
## [1] 82
sum(cred_back$result_estimado_Q6)
## [1] 979166188
summary(cred_back)
## dtscore dtscoreco IBC pendeuda co
## Min. :276.0 Min. :150.0 Min. :3.300 Min. :0.0000 0:521
## 1st Qu.:695.0 1st Qu.:700.0 1st Qu.:5.300 1st Qu.:0.0000 1:297
## Median :732.0 Median :761.0 Median :5.900 Median :0.0750
## Mean :710.1 Mean :741.1 Mean :6.156 Mean :0.1273
## 3rd Qu.:762.0 3rd Qu.:817.0 3rd Qu.:6.800 3rd Qu.:0.2100
## Max. :883.0 Max. :881.0 Max. :9.000 Max. :0.6300
## NA's :90 NA's :505
## credito tinteres ncuotas genero
## Min. : 3803300 Min. :0.01600 Min. :12.0 MASCULINO:656
## 1st Qu.:10045000 1st Qu.:0.01870 1st Qu.:36.0 MUJER :162
## Median :11954000 Median :0.02080 Median :48.0
## Mean :12237124 Mean :0.02215 Mean :51.3
## 3rd Qu.:14045000 3rd Qu.:0.02500 3rd Qu.:63.0
## Max. :60045000 Max. :0.03100 Max. :72.0
##
## cuota capacidad antiguedad_laboral tipo_vivienda
## Min. : 126000 Min. :0.0200 Min. : 1.000 arriendo :253
## 1st Qu.: 353700 1st Qu.:0.1300 1st Qu.: 2.000 propia :196
## Median : 422699 Median :0.1800 Median : 5.000 propia_pagando:369
## Mean : 435790 Mean :0.1851 Mean : 6.589
## 3rd Qu.: 488142 3rd Qu.:0.2300 3rd Qu.: 9.000
## Max. :2036384 Max. :0.5000 Max. :35.000
##
## nivel_educativo creditos_activos personas_hogar eactual
## basica :165 Min. :1.000 Min. :2.000 Min. :0.0000
## media :276 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:0.0000
## superior :157 Median :2.000 Median :3.000 Median :0.0000
## tecnica/tecnologica:220 Mean :1.762 Mean :3.219 Mean :0.1504
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:0.0000
## Max. :3.000 Max. :5.000 Max. :1.0000
##
## estratoimp prob_back pred_column_back result_estimado
## Min. :1.000 Min. :0.02346 Min. :0.0000 Min. : 0
## 1st Qu.:1.000 1st Qu.:0.07654 1st Qu.:0.0000 1st Qu.: 0
## Median :2.000 Median :0.11382 Median :0.0000 Median : 0
## Mean :2.092 Mean :0.13772 Mean :0.4095 Mean : 4842937
## 3rd Qu.:3.000 3rd Qu.:0.16781 3rd Qu.:1.0000 3rd Qu.:10931250
## Max. :4.000 Max. :0.53129 Max. :1.0000 Max. :60045000
##
## result_real pred_column_back_Q4 result_estimado_Q4 pred_column_back_Q5
## Min. : 0 Min. :0.0000 Min. : 0 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000
## Median : 0 Median :0.0000 Median : 0 Median :0.0000
## Mean : 1837742 Mean :0.2506 Mean : 2983809 Mean :0.2005
## 3rd Qu.: 0 3rd Qu.:0.7500 3rd Qu.: 2852475 3rd Qu.:0.0000
## Max. :19445000 Max. :1.0000 Max. :60045000 Max. :1.0000
##
## result_estimado_Q5 pred_column_back_Q6 result_estimado_Q6
## Min. : 0 Min. :0.0000 Min. : 0
## 1st Qu.: 0 1st Qu.:0.0000 1st Qu.: 0
## Median : 0 Median :0.0000 Median : 0
## Mean : 2323523 Mean :0.1002 Mean : 1197025
## 3rd Qu.: 0 3rd Qu.:0.0000 3rd Qu.: 0
## Max. :58237839 Max. :1.0000 Max. :58237839
##