Identificar mediante modelo de regresión logística la probabilidad de pago o no Pago de un cliente

Las librerías
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(dplyr)
library(ggplot2)

Los datos

datos <- Default
head(datos)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559
tail(datos)
##       default student   balance   income
## 9995       No     Yes  172.4130 14955.94
## 9996       No      No  711.5550 52992.38
## 9997       No      No  757.9629 19660.72
## 9998       No      No  845.4120 58636.16
## 9999       No      No 1569.0091 36669.11
## 10000      No     Yes  200.9222 16862.95

Recodificar valores

datos <- datos %>%
  select(default, balance) %>%
  mutate(default = recode(default,
                          "No"  = 0,
                          "Yes" = 1))
tail(datos)
##       default   balance
## 9995        0  172.4130
## 9996        0  711.5550
## 9997        0  757.9629
## 9998        0  845.4120
## 9999        0 1569.0091
## 10000       0  200.9222

Modelo de regresión lineal lm()

modelo <- lm(default ~ balance, data = datos)


ggplot(data = datos, aes(x = balance, y = default)) +
  geom_point(aes(color = as.factor(default)), shape = 1) + 
  geom_smooth(method = "lm", color = "gray20", se = FALSE) +
  theme_bw()  +
  labs(title = "Regresión lineal por mínimos cuadrados",
       y = "Probabilidad default") +
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

head(datos)
##   default   balance
## 1       0  729.5265
## 2       0  817.1804
## 3       0 1073.5492
## 4       0  529.2506
## 5       0  785.6559
## 6       0  919.5885

Regresión logística

modelo <- glm(default ~ balance, data = datos, family = "binomial")
summary(modelo)
## 
## Call:
## glm(formula = default ~ balance, family = "binomial", data = datos)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2697  -0.1465  -0.0589  -0.0221   3.7589  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

Representación gráfica del modelo probabilidad

ggplot(data = datos, aes(x = balance, y = default)) +
  geom_point(aes(color = as.factor(default)), shape = 1) + 
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              color = "gray20",
              se = FALSE) +
  theme_bw() +
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

Determinarndo una probabilidad

# Para el caso de un cliente de 1500 y 2500 en Balance


prediccion <- predict(object = modelo, newdata = data.frame(balance = c(1500,2500)), se.fit = TRUE)
prediccion
## $fit
##         1         2 
## -2.402955  3.095962 
## 
## $se.fit
##          1          2 
## 0.07202836 0.20760034 
## 
## $residual.scale
## [1] 1
predicciones.prob <- exp(prediccion$fit) / (1 + exp(prediccion$fit))


predicciones.prob
##          1          2 
## 0.08294762 0.95672586
# Para el caso de un cliente de 1000 y 1500 en Balance


prediccion1 <- predict(object = modelo, newdata = data.frame(balance = c(1000,1500)), se.fit = TRUE)
prediccion1
## $fit
##         1         2 
## -5.152414 -2.402955 
## 
## $se.fit
##          1          2 
## 0.15051722 0.07202836 
## 
## $residual.scale
## [1] 1
predicciones1.prob <- exp(prediccion1$fit) / (1 + exp(prediccion1$fit))


predicciones1.prob
##           1           2 
## 0.005752145 0.082947624
# Para el caso de un cliente de 2000 y 2500 en Balance


prediccion2 <- predict(object = modelo, newdata = data.frame(balance = c(2000,2500)), se.fit = TRUE)
prediccion2
## $fit
##         1         2 
## 0.3465032 3.0959617 
## 
## $se.fit
##         1         2 
## 0.1095547 0.2076003 
## 
## $residual.scale
## [1] 1
predicciones2.prob <- exp(prediccion2$fit) / (1 + exp(prediccion2$fit))


predicciones2.prob
##         1         2 
## 0.5857694 0.9567259

interpretaicon