Este archivo markdown es una recreación del enlace https://rpubs.com/chidungkt/442168, con algunas modificaciones propias del autor.

Las librerías

library(tidyverse) # Varios paquetes entre ellos dplyr y pylr
library(readr) # Leer datos 
library(ggplot2) # Gráficos
library(knitr) # PAra visuaiar tablas mas amigables
library(caret) # Para particionar datos

Los datos

datos <- read.csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
kable(head(datos))
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
1 1100 25860 39025 HomeImp Other 10.5 0 0 94.36667 1 9 NA
1 1300 70053 68400 HomeImp Other 7.0 0 2 121.83333 0 14 NA
1 1500 13500 16700 HomeImp Other 4.0 0 0 149.46667 1 10 NA
1 1500 NA NA NA NA NA NA NA NA NA
0 1700 97800 112000 HomeImp Office 3.0 0 0 93.33333 0 14 NA
1 1700 30548 40320 HomeImp Other 9.0 0 0 101.46600 1 8 37.11361
kable(tail(datos))
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
5955 0 88900 48919 93371 DebtCon Other 15 0 1 205.6502 0 15 34.81826
5956 0 88900 57264 90185 DebtCon Other 16 0 0 221.8087 0 16 36.11235
5957 0 89000 54576 92937 DebtCon Other 16 0 0 208.6921 0 15 35.85997
5958 0 89200 54045 92924 DebtCon Other 15 0 0 212.2797 0 15 35.55659
5959 0 89800 50370 91861 DebtCon Other 14 0 0 213.8927 0 16 34.34088
5960 0 89900 48811 88934 DebtCon Other 15 0 0 219.6010 0 16 34.57152

Las variables de los datos

str(datos)
## 'data.frame':    5960 obs. of  13 variables:
##  $ BAD    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ LOAN   : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ MORTDUE: num  25860 70053 13500 NA 97800 ...
##  $ VALUE  : num  39025 68400 16700 NA 112000 ...
##  $ REASON : Factor w/ 3 levels "","DebtCon","HomeImp": 3 3 3 1 3 3 3 3 3 3 ...
##  $ JOB    : Factor w/ 7 levels "","Mgr","Office",..: 4 4 4 1 3 4 4 4 4 6 ...
##  $ YOJ    : num  10.5 7 4 NA 3 9 5 11 3 16 ...
##  $ DEROG  : int  0 0 0 NA 0 0 3 0 0 0 ...
##  $ DELINQ : int  0 2 0 NA 0 0 2 0 2 0 ...
##  $ CLAGE  : num  94.4 121.8 149.5 NA 93.3 ...
##  $ NINQ   : int  1 0 1 NA 0 1 1 0 1 0 ...
##  $ CLNO   : int  9 14 10 NA 14 8 17 8 12 13 ...
##  $ DEBTINC: num  NA NA NA NA NA ...
summary(datos)
##       BAD              LOAN          MORTDUE           VALUE       
##  Min.   :0.0000   Min.   : 1100   Min.   :  2063   Min.   :  8000  
##  1st Qu.:0.0000   1st Qu.:11100   1st Qu.: 46276   1st Qu.: 66076  
##  Median :0.0000   Median :16300   Median : 65019   Median : 89236  
##  Mean   :0.1995   Mean   :18608   Mean   : 73761   Mean   :101776  
##  3rd Qu.:0.0000   3rd Qu.:23300   3rd Qu.: 91488   3rd Qu.:119824  
##  Max.   :1.0000   Max.   :89900   Max.   :399550   Max.   :855909  
##                                   NA's   :518      NA's   :112     
##      REASON          JOB            YOJ             DEROG        
##         : 252          : 279   Min.   : 0.000   Min.   : 0.0000  
##  DebtCon:3928   Mgr    : 767   1st Qu.: 3.000   1st Qu.: 0.0000  
##  HomeImp:1780   Office : 948   Median : 7.000   Median : 0.0000  
##                 Other  :2388   Mean   : 8.922   Mean   : 0.2546  
##                 ProfExe:1276   3rd Qu.:13.000   3rd Qu.: 0.0000  
##                 Sales  : 109   Max.   :41.000   Max.   :10.0000  
##                 Self   : 193   NA's   :515      NA's   :708      
##      DELINQ            CLAGE             NINQ             CLNO     
##  Min.   : 0.0000   Min.   :   0.0   Min.   : 0.000   Min.   : 0.0  
##  1st Qu.: 0.0000   1st Qu.: 115.1   1st Qu.: 0.000   1st Qu.:15.0  
##  Median : 0.0000   Median : 173.5   Median : 1.000   Median :20.0  
##  Mean   : 0.4494   Mean   : 179.8   Mean   : 1.186   Mean   :21.3  
##  3rd Qu.: 0.0000   3rd Qu.: 231.6   3rd Qu.: 2.000   3rd Qu.:26.0  
##  Max.   :15.0000   Max.   :1168.2   Max.   :17.000   Max.   :71.0  
##  NA's   :580       NA's   :308      NA's   :510      NA's   :222   
##     DEBTINC        
##  Min.   :  0.5245  
##  1st Qu.: 29.1400  
##  Median : 34.8183  
##  Mean   : 33.7799  
##  3rd Qu.: 39.0031  
##  Max.   :203.3121  
##  NA's   :1267

Funciones a utilizar

# Function replaces NA by mean: 
replace_by_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  return(x)
}

# A function imputes NA observations for categorical variables: 

replace_na_categorical <- function(x) {
  x %>% 
    table() %>% 
    as.data.frame() %>% 
    arrange(-Freq) ->> my_df
  
  n_obs <- sum(my_df$Freq)
  pop <- my_df$. %>% as.character()
  set.seed(29)
  x[is.na(x)] <- sample(pop, sum(is.na(x)), replace = TRUE, prob = my_df$Freq)
  return(x)
}

Limpiando y transformando datos

datosLimpios <- datos %>% 
  mutate_if(is.factor, as.character) %>% 
  mutate(REASON = case_when(REASON == "" ~ NA_character_, TRUE ~ REASON), 
         JOB = case_when(JOB == "" ~ NA_character_, TRUE ~ JOB)) %>%
  mutate_if(is_character, as.factor) %>% 
  mutate_if(is.numeric, replace_by_mean) %>% 
  mutate_if(is.factor,replace_na_categorical)

kable(head(datosLimpios, 30))
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
1 1100 25860.00 39025 HomeImp Other 10.500000 0.0000000 0.0000000 94.36667 1.000000 9.0000 33.779915
1 1300 70053.00 68400 HomeImp Other 7.000000 0.0000000 2.0000000 121.83333 0.000000 14.0000 33.779915
1 1500 13500.00 16700 HomeImp Other 4.000000 0.0000000 0.0000000 149.46667 1.000000 10.0000 33.779915
1 1500 73760.82 101776 DebtCon Other 8.922268 0.2545697 0.4494424 179.76628 1.186055 21.2961 33.779915
0 1700 97800.00 112000 HomeImp Office 3.000000 0.0000000 0.0000000 93.33333 0.000000 14.0000 33.779915
1 1700 30548.00 40320 HomeImp Other 9.000000 0.0000000 0.0000000 101.46600 1.000000 8.0000 37.113614
1 1800 48649.00 57037 HomeImp Other 5.000000 3.0000000 2.0000000 77.10000 1.000000 17.0000 33.779915
1 1800 28502.00 43034 HomeImp Other 11.000000 0.0000000 0.0000000 88.76603 0.000000 8.0000 36.884894
1 2000 32700.00 46740 HomeImp Other 3.000000 0.0000000 2.0000000 216.93333 1.000000 12.0000 33.779915
1 2000 73760.82 62250 HomeImp Sales 16.000000 0.0000000 0.0000000 115.80000 0.000000 13.0000 33.779915
1 2000 22608.00 101776 DebtCon Other 18.000000 0.2545697 0.4494424 179.76628 1.186055 21.2961 33.779915
1 2000 20627.00 29800 HomeImp Office 11.000000 0.0000000 1.0000000 122.53333 1.000000 9.0000 33.779915
1 2000 45000.00 55000 HomeImp Other 3.000000 0.0000000 0.0000000 86.06667 2.000000 25.0000 33.779915
0 2000 64536.00 87400 DebtCon Mgr 2.500000 0.0000000 0.0000000 147.13333 0.000000 24.0000 33.779915
1 2100 71000.00 83850 HomeImp Other 8.000000 0.0000000 1.0000000 123.00000 0.000000 16.0000 33.779915
1 2200 24280.00 34687 HomeImp Other 8.922268 0.0000000 1.0000000 300.86667 0.000000 8.0000 33.779915
1 2200 90957.00 102600 HomeImp Mgr 7.000000 2.0000000 6.0000000 122.90000 1.000000 22.0000 33.779915
1 2200 23030.00 101776 DebtCon Other 19.000000 0.2545697 0.4494424 179.76628 1.186055 21.2961 3.711312
1 2300 28192.00 40150 HomeImp Other 4.500000 0.0000000 0.0000000 54.60000 1.000000 16.0000 33.779915
0 2300 102370.00 120953 HomeImp Office 2.000000 0.0000000 0.0000000 90.99253 0.000000 13.0000 31.588503
1 2300 37626.00 46200 HomeImp Other 3.000000 0.0000000 1.0000000 122.26667 1.000000 14.0000 33.779915
1 2400 50000.00 73395 HomeImp ProfExe 5.000000 1.0000000 0.0000000 179.76628 1.000000 0.0000 33.779915
1 2400 28000.00 40800 HomeImp Mgr 12.000000 0.0000000 0.0000000 67.20000 2.000000 22.0000 33.779915
1 2400 18000.00 101776 HomeImp Mgr 22.000000 0.2545697 2.0000000 121.73333 0.000000 10.0000 33.779915
1 2400 73760.82 17180 HomeImp Other 8.922268 0.0000000 0.0000000 14.56667 3.000000 4.0000 33.779915
1 2400 34863.00 47471 HomeImp Mgr 12.000000 0.0000000 0.0000000 70.49108 1.000000 21.0000 38.263601
0 2400 98449.00 117195 HomeImp Office 4.000000 0.0000000 0.0000000 93.81177 0.000000 13.0000 29.681827
1 2500 15000.00 20200 HomeImp Other 18.000000 0.0000000 0.0000000 136.06667 1.000000 19.0000 33.779915
1 2500 25116.00 36350 HomeImp Other 10.000000 1.0000000 2.0000000 276.96667 0.000000 9.0000 33.779915
0 2500 7229.00 44516 HomeImp Self 8.922268 0.0000000 0.0000000 208.00000 0.000000 12.0000 33.779915

Datos de entrenamiento y datos de validación

set.seed(2020)

entrena <- createDataPartition(datosLimpios$BAD, p=0.7, list = FALSE)

datos.Entrena <- datosLimpios[entrena,]
datos.Valida <- datosLimpios[-entrena,]

kable(head(datos.Entrena, 10))
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
2 1 1300 70053.00 68400 HomeImp Other 7.0 0 2 121.83333 0 14 33.77992
5 0 1700 97800.00 112000 HomeImp Office 3.0 0 0 93.33333 0 14 33.77992
6 1 1700 30548.00 40320 HomeImp Other 9.0 0 0 101.46600 1 8 37.11361
8 1 1800 28502.00 43034 HomeImp Other 11.0 0 0 88.76603 0 8 36.88489
9 1 2000 32700.00 46740 HomeImp Other 3.0 0 2 216.93333 1 12 33.77992
10 1 2000 73760.82 62250 HomeImp Sales 16.0 0 0 115.80000 0 13 33.77992
12 1 2000 20627.00 29800 HomeImp Office 11.0 0 1 122.53333 1 9 33.77992
13 1 2000 45000.00 55000 HomeImp Other 3.0 0 0 86.06667 2 25 33.77992
14 0 2000 64536.00 87400 DebtCon Mgr 2.5 0 0 147.13333 0 24 33.77992
15 1 2100 71000.00 83850 HomeImp Other 8.0 0 1 123.00000 0 16 33.77992
kable(head(datos.Valida, 10))
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
1 1 1100 25860.00 39025 HomeImp Other 10.500000 0.0000000 0.0000000 94.36667 1.000000 9.0000 33.77992
3 1 1500 13500.00 16700 HomeImp Other 4.000000 0.0000000 0.0000000 149.46667 1.000000 10.0000 33.77992
4 1 1500 73760.82 101776 DebtCon Other 8.922268 0.2545697 0.4494424 179.76628 1.186055 21.2961 33.77992
7 1 1800 48649.00 57037 HomeImp Other 5.000000 3.0000000 2.0000000 77.10000 1.000000 17.0000 33.77992
11 1 2000 22608.00 101776 DebtCon Other 18.000000 0.2545697 0.4494424 179.76628 1.186055 21.2961 33.77992
17 1 2200 90957.00 102600 HomeImp Mgr 7.000000 2.0000000 6.0000000 122.90000 1.000000 22.0000 33.77992
24 1 2400 18000.00 101776 HomeImp Mgr 22.000000 0.2545697 2.0000000 121.73333 0.000000 10.0000 33.77992
28 1 2500 15000.00 20200 HomeImp Other 18.000000 0.0000000 0.0000000 136.06667 1.000000 19.0000 33.77992
36 0 2900 104373.00 120702 HomeImp Office 2.000000 0.0000000 0.0000000 101.54030 0.000000 13.0000 29.91586
42 1 3000 73760.82 33000 HomeImp Other 1.000000 0.0000000 1.0000000 23.30000 1.000000 2.0000 33.77992

Modelo de Regresión logística

Se utilizan las variables independientes “LOAN”, “VALUE”, “REASON”, “JOB”, “DEROG”, “DELINQ”, “CLAGE”, “NINQ”, “CLNO”, “DEBTINC” porque son sugeridas bajo el criterio AIC del enlace:https://rpubs.com/chidungkt/442168

modelo <- glm(data = datos.Entrena, formula = BAD ~ LOAN + VALUE + REASON + JOB + DEROG + DELINQ + CLAGE + NINQ + CLNO + DEBTINC, family = binomial)
# modelo
summary(modelo)
## 
## Call:
## glm(formula = BAD ~ LOAN + VALUE + REASON + JOB + DEROG + DELINQ + 
##     CLAGE + NINQ + CLNO + DEBTINC, family = binomial, data = datos.Entrena)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1432  -0.6000  -0.4299  -0.2685   3.9161  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.637e+00  3.049e-01  -8.649  < 2e-16 ***
## LOAN          -2.321e-05  4.994e-06  -4.647 3.37e-06 ***
## VALUE          6.437e-07  9.992e-07   0.644 0.519441    
## REASONHomeImp  2.898e-01  9.861e-02   2.939 0.003296 ** 
## JOBOffice     -5.594e-01  1.766e-01  -3.168 0.001537 ** 
## JOBOther       1.977e-01  1.349e-01   1.465 0.142812    
## JOBProfExe    -2.474e-02  1.578e-01  -0.157 0.875395    
## JOBSales       1.057e+00  2.945e-01   3.591 0.000329 ***
## JOBSelf        7.433e-01  2.458e-01   3.023 0.002499 ** 
## DEROG          6.026e-01  5.982e-02  10.074  < 2e-16 ***
## DELINQ         7.605e-01  4.618e-02  16.468  < 2e-16 ***
## CLAGE         -5.837e-03  6.453e-04  -9.045  < 2e-16 ***
## NINQ           1.482e-01  2.440e-02   6.076 1.23e-09 ***
## CLNO          -1.059e-02  4.926e-03  -2.150 0.031531 *  
## DEBTINC        5.330e-02  7.229e-03   7.373 1.67e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4193.6  on 4171  degrees of freedom
## Residual deviance: 3281.1  on 4157  degrees of freedom
## AIC: 3311.1
## 
## Number of Fisher Scoring iterations: 5

Evaluar el modelo

comparar <- data.frame(datos.Entrena$BAD, as.vector(modelo$fitted.values) )
# comparar

comparar <- comparar %>%
    mutate(BAD.Ajustado = if_else (modelo$fitted.values >= 0.5, 1, 0))

names(comparar) <- c("BAD", "Ajustado", "BAD.Ajustado")
kable(head(comparar))
BAD Ajustado BAD.Ajustado
1 0.5809177 1
0 0.1459916 0
1 0.3288479 0
1 0.3099658 0
1 0.4776954 0
1 0.4233458 0

Generando matriz de confusión del modelo generado

matriz_confusion <- table(comparar$BAD, comparar$BAD.Ajustado, dnn = c("BAD", "BAd Ajustado"))
matriz_confusion
##    BAd Ajustado
## BAD    0    1
##   0 3227  104
##   1  585  256

Precisón del modelo

n = nrow(datos.Entrena)
precision <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n

precision
## [1] 0.8348514

El modelo tiene un valor de precisón del 0.8349, significa que acierta en el 83.4851 % de los casos

Analizando el conunto de datos de Validación

predicciones <- predict(modelo, datos.Valida, se.fit = TRUE)

kable(head(predicciones$fit))
x
1 -0.8471469
3 -1.2029851
4 -1.2117388
7 2.4929409
11 -1.2233415
17 4.4342981

Convertir predicciones en probabilidad

predicciones_prob <- exp(predicciones$fit) / (1 + exp(predicciones$fit))

kable(head(predicciones_prob))
x
1 0.3000317
3 0.2309446
4 0.2293935
7 0.9236455
11 0.2273490
17 0.9882757

Evaluando las predicciones

las.predicciones <- cbind(datos.Valida, predicciones_prob)

las.predicciones <- las.predicciones %>%
  mutate(BAD.Predic =  if_else(predicciones_prob > 0.5, 1, 0))
  
kable(head(las.predicciones[,c(1,14,15)]))
BAD predicciones_prob BAD.Predic
1 0.3000317 0
1 0.2309446 0
1 0.2293935 0
1 0.9236455 1
1 0.2273490 0
1 0.9882757 1

Determinando matriz de confusión del conjunto de validación

matriz_confusion <- table(las.predicciones$BAD, las.predicciones$BAD.Predic, dnn = c("BAD", "BAD Predicho"))
matriz_confusion
##    BAD Predicho
## BAD    0    1
##   0 1387   53
##   1  239  109

¿Qué tan preciso fue la predicción?

n = nrow(datos.Valida)
precision <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n

precision
## [1] 0.836689

Predecir cuatro nuevos peticiones de clientes

LOAN <- c(2000, 2000, 2500,2500)
MORTDUE <- c(4000, 64536, 25116, 7229)
VALUE <- c(55000, 87400, 36350, 44516)
REASON <- c("HomeImp", "DebtCon", "HomeImp", "HomeImp")
JOB <- c("Other", "Mgr", "Other", "Self")
YOJ <- c(10, 2.5, 10, 8.92)
DEROG <- c(0, 0, 1, 0)
DELINQ <- c(0, 0, 2, 0)
CLAGE <- c(86.067, 147.13, 276.97, 208)
NINQ <- c(2, 0, 0, 0)
CLNO <- c(25, 24, 9, 12)
DEBTINC <- c(33.7799, 33.7799, 33.7799, 33.7799)

nuevas_peticiones <- data.frame(LOAN, MORTDUE, VALUE, REASON, JOB, YOJ, DEROG, DELINQ, CLAGE, NINQ, CLNO, DEBTINC)
names(nuevas_peticiones) <- c("LOAN","MORTDUE","VALUE","REASON","JOB","YOJ","DEROG","DELINQ","CLAGE","NINQ","CLNO","DEBTINC")

kable(nuevas_peticiones)
LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
2000 4000 55000 HomeImp Other 10.00 0 0 86.067 2 25 33.7799
2000 64536 87400 DebtCon Mgr 2.50 0 0 147.130 0 24 33.7799
2500 25116 36350 HomeImp Other 10.00 1 2 276.970 0 9 33.7799
2500 7229 44516 HomeImp Self 8.92 0 0 208.000 0 12 33.7799

Realizar las nuevas predicciones

predicciones <- predict(modelo, nuevas_peticiones, se.fit = TRUE)

prediccion_prob <- exp(predicciones$fit) / (1 + exp(predicciones$fit))
prediccion_prob <- as.data.frame(prediccion_prob)

names(prediccion_prob) <- c("Prob.Predic")

kable(prediccion_prob)
Prob.Predic
0.3035312
0.1257089
0.5070368
0.2361821

Finalmente

prediccion_prob <- prediccion_prob %>%
    mutate(Prestamo = if_else(Prob.Predic > 0.5, "NO", "SI"))

kable(prediccion_prob)
Prob.Predic Prestamo
0.3035312 SI
0.1257089 SI
0.5070368 NO
0.2361821 SI