Modelos de Regresión Lineal y Regresión Logística

# Cargar librerías
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(quantmod)
## Warning: package 'quantmod' was built under R version 4.3.2
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.2
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.2
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()          masks stats::filter()
## ✖ xts::first()             masks dplyr::first()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ xts::last()              masks dplyr::last()
## ✖ purrr::lift()            masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

# Load the data ----
# Earlier alternative source, kept for reference:
#   bd <- read.csv("Actividad2.csv")
#
# The interactive file.choose() call that used to sit here was removed: its
# result was never used, and an interactive prompt can stall or abort a
# non-interactive run (Rscript, knitting).

# Location of the Excel workbook, named once so it is easy to change when the
# script runs on another machine.
ruta_datos <- "C:\\Users\\sguerra\\Downloads\\default of credit card clients.xls"

# Fail early with a clear message instead of a lower-level read_excel() error.
if (!file.exists(ruta_datos)) {
  stop("Data file not found: ", ruta_datos, call. = FALSE)
}

bd <- read_excel(ruta_datos)

# Clean and convert the data ----
# Step 1: drop the first ROW (bd[-1, ] removes a row, not a column); per the
#         original note it only holds the descriptive labels of the variables.
# Step 2: convert every column that was read in as character to numeric
#         (per the original note, that is all of them).
bd <- bd[-1, ] %>%
  mutate(across(where(is.character), as.numeric))

# Inspect the structure of the cleaned data (column names, types, first values)
glimpse(bd)
## Rows: 30,000
## Columns: 25
## $ ID  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,…
## $ X1  <dbl> 20000, 120000, 90000, 50000, 50000, 50000, 500000, 100000, 140000,…
## $ X2  <dbl> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, …
## $ X3  <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 1, 1, 3, 2, …
## $ X4  <dbl> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 2, 2, 1, …
## $ X5  <dbl> 24, 26, 34, 37, 57, 37, 29, 23, 28, 35, 34, 51, 41, 30, 29, 23, 24…
## $ X6  <dbl> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0, -1, -1, 1, 0, 1, 0, 0, 1, 1, 0…
## $ X7  <dbl> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, -1, 0, 2, 0, 2, 0, 0, -2, -2, 0…
## $ X8  <dbl> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2, -1, -1, 2, 0, 0, 2, 0, -2, -2…
## $ X9  <dbl> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,…
## $ X10 <dbl> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,…
## $ X11 <dbl> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1, 2, -1, 2, 0, 0, 2, -1, -2, -2…
## $ X12 <dbl> 3913, 2682, 29239, 46990, 8617, 64400, 367965, 11876, 11285, 0, 11…
## $ X13 <dbl> 3102, 1725, 14027, 48233, 5670, 57069, 412023, 380, 14096, 0, 9787…
## $ X14 <dbl> 689, 2682, 13559, 49291, 35835, 57608, 445007, 601, 12108, 0, 5535…
## $ X15 <dbl> 0, 3272, 14331, 28314, 20940, 19394, 542653, 221, 12211, 0, 2513, …
## $ X16 <dbl> 0, 3455, 14948, 28959, 19146, 19619, 483003, -159, 11793, 13007, 1…
## $ X17 <dbl> 0, 3261, 15549, 29547, 19131, 20024, 473944, 567, 3719, 13912, 373…
## $ X18 <dbl> 0, 0, 1518, 2000, 2000, 2500, 55000, 380, 3329, 0, 2306, 21818, 10…
## $ X19 <dbl> 689, 1000, 1500, 2019, 36681, 1815, 40000, 601, 0, 0, 12, 9966, 65…
## $ X20 <dbl> 0, 1000, 1000, 1200, 10000, 657, 38000, 0, 432, 0, 50, 8583, 6500,…
## $ X21 <dbl> 0, 1000, 1000, 1100, 9000, 1000, 20239, 581, 1000, 13007, 300, 223…
## $ X22 <dbl> 0, 0, 1000, 1069, 689, 1000, 13750, 1687, 1000, 1122, 3738, 0, 287…
## $ X23 <dbl> 0, 2000, 5000, 1000, 679, 800, 13770, 1542, 1000, 0, 66, 3640, 0, …
## $ Y   <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, …
# Descriptive statistics (min, quartiles, median, mean, max) for every column
summary(bd)
##        ID              X1                X2              X3       
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :15000   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
##  3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##        X4              X5              X6                X7         
##  Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
##  Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
##        X8                X9               X10               X11         
##  Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
##  Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
##       X12               X13              X14               X15         
##  Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
##  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
##  Median :  22382   Median : 21200   Median :  20089   Median :  19052  
##  Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
##  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
##  Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
##       X16              X17               X18              X19         
##  Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
##  1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
##  Median : 18105   Median :  17071   Median :  2100   Median :   2009  
##  Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
##  3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
##  Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
##       X20              X21              X22                X23          
##  Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
##  Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
##        Y         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2212  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

Y: Default (Yes = 1, No = 0) - Whether the individual has defaulted on their credit. X1: Amount of Credit - The credit limit granted to the individual by the bank. X2: Gender (1 = male, 2 = female) - The gender of the individual. X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others) - The highest level of education of the individual. X4: Marital Status (1 = married; 2 = single; 3 = others) - The marital status of the individual. X5: Age - The age of the individual in years. X6-X11: History of Past Payment - The individual's monthly repayment status from April to September 2005, on a scale that runs from paying duly up to a payment delay of nine months or more. X12-X17: Amount of Bill Statement - The amount of each monthly bill statement from April to September 2005. X18-X23: Amount of Previous Payment - The amount paid in each month from April to September 2005.

# Linear regression ----
# Ordinary least squares fit of the default indicator Y on all 23 predictors
# (ID is deliberately left out of the formula).
LMcredito <- lm(
  formula = Y ~
    X1 + X2 + X3 + X4 + X5 +              # credit limit and demographics
    X6 + X7 + X8 + X9 + X10 + X11 +       # history of past payment
    X12 + X13 + X14 + X15 + X16 + X17 +   # bill statement amounts
    X18 + X19 + X20 + X21 + X22 + X23,    # previous payment amounts
  data = bd
)

# Coefficient table, residual summary and overall fit statistics
summary(LMcredito)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + 
##     X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + 
##     X20 + X21 + X22 + X23, data = bd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29527 -0.24079 -0.16177  0.03385  1.30480 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.142e-01  1.791e-02  17.541  < 2e-16 ***
## X1          -9.053e-08  2.159e-08  -4.193 2.76e-05 ***
## X2          -1.453e-02  4.642e-03  -3.130  0.00175 ** 
## X3          -1.513e-02  3.012e-03  -5.022 5.15e-07 ***
## X4          -2.382e-02  4.768e-03  -4.996 5.88e-07 ***
## X5           1.409e-03  2.749e-04   5.128 2.95e-07 ***
## X6           9.571e-02  2.766e-03  34.596  < 2e-16 ***
## X7           1.946e-02  3.339e-03   5.828 5.68e-09 ***
## X8           1.167e-02  3.585e-03   3.256  0.00113 ** 
## X9           3.362e-03  3.974e-03   0.846  0.39755    
## X10          5.699e-03  4.304e-03   1.324  0.18545    
## X11          7.920e-04  3.521e-03   0.225  0.82201    
## X12         -6.225e-07  1.141e-07  -5.453 4.98e-08 ***
## X13          1.587e-07  1.603e-07   0.990  0.32225    
## X14          3.005e-08  1.510e-07   0.199  0.84222    
## X15         -6.793e-08  1.573e-07  -0.432  0.66587    
## X16         -2.049e-08  1.845e-07  -0.111  0.91159    
## X17          1.153e-07  1.460e-07   0.789  0.42998    
## X18         -7.437e-07  1.770e-07  -4.201 2.67e-05 ***
## X19         -2.092e-07  1.457e-07  -1.436  0.15095    
## X20         -2.874e-08  1.689e-07  -0.170  0.86492    
## X21         -2.521e-07  1.839e-07  -1.371  0.17047    
## X22         -3.410e-07  1.908e-07  -1.787  0.07393 .  
## X23         -9.770e-08  1.365e-07  -0.716  0.47422    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3886 on 29976 degrees of freedom
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1233 
## F-statistic: 184.5 on 23 and 29976 DF,  p-value: < 2.2e-16
# Linear regression with the significant variables ----
# Same response as the full model, restricted to the predictors retained as
# significant there. Note that X22 was only significant at the 10% level in
# the full model (p = 0.074).
LMcredito_sig <- lm(
  formula = Y ~
    X1 + X2 + X3 + X4 + X5 +   # credit limit and demographics
    X6 + X7 + X8 +             # history of past payment
    X12 +                      # bill statement amount
    X18 + X22,                 # previous payment amounts
  data = bd
)

# Coefficient table, residual summary and overall fit statistics
summary(LMcredito_sig)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + 
##     X18 + X22, data = bd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.30289 -0.23964 -0.16149  0.03257  1.24712 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.131e-01  1.791e-02  17.482  < 2e-16 ***
## X1          -1.038e-07  2.102e-08  -4.937 7.99e-07 ***
## X2          -1.432e-02  4.640e-03  -3.086  0.00203 ** 
## X3          -1.526e-02  3.010e-03  -5.070 4.00e-07 ***
## X4          -2.383e-02  4.767e-03  -4.999 5.79e-07 ***
## X5           1.411e-03  2.748e-04   5.132 2.88e-07 ***
## X6           9.718e-02  2.741e-03  35.455  < 2e-16 ***
## X7           2.020e-02  3.306e-03   6.110 1.01e-09 ***
## X8           1.779e-02  2.983e-03   5.964 2.49e-09 ***
## X12         -4.507e-07  3.462e-08 -13.017  < 2e-16 ***
## X18         -7.533e-07  1.406e-07  -5.357 8.54e-08 ***
## X22         -3.504e-07  1.523e-07  -2.300  0.02143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3887 on 29988 degrees of freedom
## Multiple R-squared:  0.1234, Adjusted R-squared:  0.123 
## F-statistic: 383.6 on 11 and 29988 DF,  p-value: < 2.2e-16
# Logistic regression ----
# Binomial GLM for the default indicator Y, using the same predictors as the
# reduced linear model.
GLMcredit <- glm(
  formula = Y ~
    X1 + X2 + X3 + X4 + X5 +   # credit limit and demographics
    X6 + X7 + X8 +             # history of past payment
    X12 +                      # bill statement amount
    X18 + X22,                 # previous payment amounts
  family = binomial,
  data = bd
)

# Coefficient table, deviances and AIC
summary(GLMcredit)
## 
## Call:
## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + 
##     X18 + X22, family = binomial, data = bd)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.163e-01  1.185e-01  -6.047 1.48e-09 ***
## X1          -9.879e-07  1.518e-07  -6.509 7.58e-11 ***
## X2          -1.027e-01  3.063e-02  -3.352 0.000803 ***
## X3          -1.055e-01  2.088e-02  -5.050 4.42e-07 ***
## X4          -1.573e-01  3.166e-02  -4.968 6.77e-07 ***
## X5           7.664e-03  1.777e-03   4.312 1.62e-05 ***
## X6           5.960e-01  1.752e-02  34.018  < 2e-16 ***
## X7           7.974e-02  1.991e-02   4.004 6.22e-05 ***
## X8           1.284e-01  1.837e-02   6.988 2.79e-12 ***
## X12         -1.891e-06  2.621e-07  -7.216 5.34e-13 ***
## X18         -1.346e-05  2.111e-06  -6.378 1.79e-10 ***
## X22         -4.583e-06  1.529e-06  -2.998 0.002718 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31705  on 29999  degrees of freedom
## Residual deviance: 27956  on 29988  degrees of freedom
## AIC: 27980
## 
## Number of Fisher Scoring iterations: 5
# Score a new observation with the logistic model ----
# 'new_data' must hold one column per predictor used by GLMcredit
# (X1-X8, X12, X18 and X22). The values below are an example; replace them
# with the real data you want to evaluate.
new_data <- data.frame(
  X1  = 50000,  # amount of credit
  X2  = 1,      # gender (1 = male)
  X3  = 2,      # education (2 = university)
  X4  = 1,      # marital status (1 = married)
  X5  = 57,     # age in years
  X6  = -1,     # history of past payment
  X7  = 0,      # history of past payment
  X8  = -1,     # history of past payment
  X12 = 8617,   # amount of bill statement
  X18 = 2000,   # amount of previous payment
  X22 = 689     # amount of previous payment
)

# type = "response" returns predictions on the probability scale rather than
# on the scale of the linear predictor.
probabilidades <- predict(
  object = GLMcredit,
  newdata = new_data,
  type = "response"
)

# One predicted probability of default per row of 'new_data'
print(probabilidades)
##         1 
## 0.1721439