Modelos de Regresión Lineal y Regresión Logística

# Instalar y cargar librerías
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(quantmod)
## Warning: package 'quantmod' was built under R version 4.3.2
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.2
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.2
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()          masks stats::filter()
## ✖ xts::first()             masks dplyr::first()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ xts::last()              masks dplyr::last()
## ✖ purrr::lift()            masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Definir colores
rojo <- '#FF6262'
azul <- '#3434FF'

# Cargar y preparar el archivo original
bd <- read.csv("C:\\Users\\Deisy Lopez\\Documents\\RStudio_MPF\\Actividad2.csv")
bd <- bd[-1, ]
bd <- bd %>% mutate(across(where(is.character), as.numeric))

glimpse(bd)
## Rows: 30,000
## Columns: 25
## $ X   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,…
## $ X1  <dbl> 20000, 120000, 90000, 50000, 50000, 50000, 500000, 100000, 140000,…
## $ X2  <dbl> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, …
## $ X3  <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 1, 1, 3, 2, …
## $ X4  <dbl> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 2, 2, 1, …
## $ X5  <dbl> 24, 26, 34, 37, 57, 37, 29, 23, 28, 35, 34, 51, 41, 30, 29, 23, 24…
## $ X6  <dbl> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0, -1, -1, 1, 0, 1, 0, 0, 1, 1, 0…
## $ X7  <dbl> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, -1, 0, 2, 0, 2, 0, 0, -2, -2, 0…
## $ X8  <dbl> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2, -1, -1, 2, 0, 0, 2, 0, -2, -2…
## $ X9  <dbl> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,…
## $ X10 <dbl> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,…
## $ X11 <dbl> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1, 2, -1, 2, 0, 0, 2, -1, -2, -2…
## $ X12 <dbl> 3913, 2682, 29239, 46990, 8617, 64400, 367965, 11876, 11285, 0, 11…
## $ X13 <dbl> 3102, 1725, 14027, 48233, 5670, 57069, 412023, 380, 14096, 0, 9787…
## $ X14 <dbl> 689, 2682, 13559, 49291, 35835, 57608, 445007, 601, 12108, 0, 5535…
## $ X15 <dbl> 0, 3272, 14331, 28314, 20940, 19394, 542653, 221, 12211, 0, 2513, …
## $ X16 <dbl> 0, 3455, 14948, 28959, 19146, 19619, 483003, -159, 11793, 13007, 1…
## $ X17 <dbl> 0, 3261, 15549, 29547, 19131, 20024, 473944, 567, 3719, 13912, 373…
## $ X18 <dbl> 0, 0, 1518, 2000, 2000, 2500, 55000, 380, 3329, 0, 2306, 21818, 10…
## $ X19 <dbl> 689, 1000, 1500, 2019, 36681, 1815, 40000, 601, 0, 0, 12, 9966, 65…
## $ X20 <dbl> 0, 1000, 1000, 1200, 10000, 657, 38000, 0, 432, 0, 50, 8583, 6500,…
## $ X21 <dbl> 0, 1000, 1000, 1100, 9000, 1000, 20239, 581, 1000, 13007, 300, 223…
## $ X22 <dbl> 0, 0, 1000, 1069, 689, 1000, 13750, 1687, 1000, 1122, 3738, 0, 287…
## $ X23 <dbl> 0, 2000, 5000, 1000, 679, 800, 13770, 1542, 1000, 0, 66, 3640, 0, …
## $ Y   <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, …
summary(bd)
##        X               X1                X2              X3       
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :15000   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
##  3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##        X4              X5              X6                X7         
##  Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
##  Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
##        X8                X9               X10               X11         
##  Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
##  Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
##       X12               X13              X14               X15         
##  Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
##  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
##  Median :  22382   Median : 21200   Median :  20089   Median :  19052  
##  Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
##  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
##  Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
##       X16              X17               X18              X19         
##  Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
##  1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
##  Median : 18105   Median :  17071   Median :  2100   Median :   2009  
##  Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
##  3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
##  Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
##       X20              X21              X22                X23          
##  Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
##  Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
##        Y         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2212  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
# Crear una gráfica profesional con los datos originales
bd %>%
  ggplot(aes(x = X5, y = X1, color = factor(Y))) +
  geom_point(size = 1, alpha = 0.6) +
  labs(title = "Edad vs Monto del Crédito", subtitle = "Color por Incumplimiento",
       x = "Edad", y = "Monto del Crédito") +
  scale_color_manual(values = c(rojo, azul))
plot of chunk unnamed-chunk-1

Modelos de Regresión

#Regresión lineal
LMcredito <- lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23,
                data=bd)
summary(LMcredito)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + 
##     X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + 
##     X20 + X21 + X22 + X23, data = bd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29527 -0.24079 -0.16177  0.03385  1.30480 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.142e-01  1.791e-02  17.541  < 2e-16 ***
## X1          -9.053e-08  2.159e-08  -4.193 2.76e-05 ***
## X2          -1.453e-02  4.642e-03  -3.130  0.00175 ** 
## X3          -1.513e-02  3.012e-03  -5.022 5.15e-07 ***
## X4          -2.382e-02  4.768e-03  -4.996 5.88e-07 ***
## X5           1.409e-03  2.749e-04   5.128 2.95e-07 ***
## X6           9.571e-02  2.766e-03  34.596  < 2e-16 ***
## X7           1.946e-02  3.339e-03   5.828 5.68e-09 ***
## X8           1.167e-02  3.585e-03   3.256  0.00113 ** 
## X9           3.362e-03  3.974e-03   0.846  0.39755    
## X10          5.699e-03  4.304e-03   1.324  0.18545    
## X11          7.920e-04  3.521e-03   0.225  0.82201    
## X12         -6.225e-07  1.141e-07  -5.453 4.98e-08 ***
## X13          1.587e-07  1.603e-07   0.990  0.32225    
## X14          3.005e-08  1.510e-07   0.199  0.84222    
## X15         -6.793e-08  1.573e-07  -0.432  0.66587    
## X16         -2.049e-08  1.845e-07  -0.111  0.91159    
## X17          1.153e-07  1.460e-07   0.789  0.42998    
## X18         -7.437e-07  1.770e-07  -4.201 2.67e-05 ***
## X19         -2.092e-07  1.457e-07  -1.436  0.15095    
## X20         -2.874e-08  1.689e-07  -0.170  0.86492    
## X21         -2.521e-07  1.839e-07  -1.371  0.17047    
## X22         -3.410e-07  1.908e-07  -1.787  0.07393 .  
## X23         -9.770e-08  1.365e-07  -0.716  0.47422    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3886 on 29976 degrees of freedom
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1233 
## F-statistic: 184.5 on 23 and 29976 DF,  p-value: < 2.2e-16
#Regresión lineal con variables significativas
LMcredito_sig <- lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + X18,
                    data=bd)
summary(LMcredito_sig)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + 
##     X18, data = bd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.30307 -0.23949 -0.16138  0.03274  1.26179 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.133e-01  1.791e-02  17.495  < 2e-16 ***
## X1          -1.105e-07  2.082e-08  -5.309 1.11e-07 ***
## X2          -1.429e-02  4.640e-03  -3.080  0.00207 ** 
## X3          -1.528e-02  3.011e-03  -5.075 3.91e-07 ***
## X4          -2.403e-02  4.767e-03  -5.041 4.67e-07 ***
## X5           1.412e-03  2.749e-04   5.138 2.79e-07 ***
## X6           9.737e-02  2.740e-03  35.537  < 2e-16 ***
## X7           2.007e-02  3.306e-03   6.071 1.28e-09 ***
## X8           1.785e-02  2.983e-03   5.983 2.22e-09 ***
## X12         -4.587e-07  3.445e-08 -13.315  < 2e-16 ***
## X18         -7.858e-07  1.399e-07  -5.616 1.97e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3887 on 29989 degrees of freedom
## Multiple R-squared:  0.1232, Adjusted R-squared:  0.1229 
## F-statistic: 421.4 on 10 and 29989 DF,  p-value: < 2.2e-16
# Regresión Logística
GLMcredit <- glm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23,
                 family=binomial, data=bd)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(GLMcredit)
## 
## Call:
## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + 
##     X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + 
##     X20 + X21 + X22 + X23, family = binomial, data = bd)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.863e-01  1.187e-01  -5.784 7.30e-09 ***
## X1          -7.623e-07  1.569e-07  -4.859 1.18e-06 ***
## X2          -1.087e-01  3.069e-02  -3.541 0.000399 ***
## X3          -1.016e-01  2.097e-02  -4.844 1.27e-06 ***
## X4          -1.544e-01  3.170e-02  -4.869 1.12e-06 ***
## X5           7.420e-03  1.779e-03   4.170 3.04e-05 ***
## X6           5.774e-01  1.769e-02  32.632  < 2e-16 ***
## X7           8.282e-02  2.018e-02   4.103 4.07e-05 ***
## X8           7.214e-02  2.260e-02   3.192 0.001415 ** 
## X9           2.389e-02  2.500e-02   0.956 0.339312    
## X10          3.401e-02  2.688e-02   1.266 0.205685    
## X11          8.038e-03  2.213e-02   0.363 0.716448    
## X12         -5.492e-06  1.136e-06  -4.835 1.33e-06 ***
## X13          2.356e-06  1.504e-06   1.566 0.117280    
## X14          1.365e-06  1.323e-06   1.032 0.302073    
## X15         -1.821e-07  1.349e-06  -0.135 0.892609    
## X16          6.155e-07  1.518e-06   0.405 0.685246    
## X17          3.938e-07  1.195e-06   0.330 0.741692    
## X18         -1.363e-05  2.305e-06  -5.913 3.36e-09 ***
## X19         -9.616e-06  2.095e-06  -4.590 4.42e-06 ***
## X20         -2.742e-06  1.723e-06  -1.592 0.111456    
## X21         -4.023e-06  1.785e-06  -2.254 0.024185 *  
## X22         -3.311e-06  1.777e-06  -1.864 0.062387 .  
## X23         -2.064e-06  1.296e-06  -1.593 0.111212    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31705  on 29999  degrees of freedom
## Residual deviance: 27877  on 29976  degrees of freedom
## AIC: 27925
## 
## Number of Fisher Scoring iterations: 6
#Regresión logística valores significativos
GLMcredit_sig <- glm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + X18 + X19,
                 family=binomial, data=bd)
summary(GLMcredit_sig)
## 
## Call:
## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + 
##     X18 + X19, family = binomial, data = bd)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.075e-01  1.184e-01  -5.975 2.31e-09 ***
## X1          -9.505e-07  1.515e-07  -6.273 3.55e-10 ***
## X2          -1.042e-01  3.063e-02  -3.403 0.000666 ***
## X3          -1.045e-01  2.090e-02  -5.001 5.70e-07 ***
## X4          -1.574e-01  3.165e-02  -4.974 6.57e-07 ***
## X5           7.634e-03  1.777e-03   4.296 1.74e-05 ***
## X6           5.938e-01  1.753e-02  33.875  < 2e-16 ***
## X7           8.373e-02  1.993e-02   4.201 2.65e-05 ***
## X8           1.208e-01  1.839e-02   6.569 5.06e-11 ***
## X12         -1.844e-06  2.627e-07  -7.020 2.21e-12 ***
## X18         -1.217e-05  2.095e-06  -5.811 6.20e-09 ***
## X19         -8.591e-06  1.806e-06  -4.756 1.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31705  on 29999  degrees of freedom
## Residual deviance: 27935  on 29988  degrees of freedom
## AIC: 27959
## 
## Number of Fisher Scoring iterations: 6
# Gráfica
bd$prediccion_GLM <- predict(GLMcredit_sig, newdata = bd, type = "response")
bd %>%
  ggplot(aes(x = X5, y = prediccion_GLM, color = factor(Y))) +
  geom_point(size = 1, alpha = 0.6) +
  labs(title = "Predicción de Default vs Edad", subtitle = "Predicciones de Regresión Logística",
       x = "Edad", y = "Probabilidad de Default") +
  scale_color_manual(values = c(rojo, azul))
plot of chunk unnamed-chunk-2
#Confusion Matrix
predicted_class <- ifelse(bd$prediccion_GLM > 0.5, 1, 0)
confusionMatrix(as.factor(predicted_class), as.factor(bd$Y))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 22721  5080
##          1   643  1556
##                                           
##                Accuracy : 0.8092          
##                  95% CI : (0.8047, 0.8137)
##     No Information Rate : 0.7788          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2721          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9725          
##             Specificity : 0.2345          
##          Pos Pred Value : 0.8173          
##          Neg Pred Value : 0.7076          
##              Prevalence : 0.7788          
##          Detection Rate : 0.7574          
##    Detection Prevalence : 0.9267          
##       Balanced Accuracy : 0.6035          
##                                           
##        'Positive' Class : 0               
##