Modelos de Regresión Lineal y Regresión Logística
# Instalar y cargar librerías library(dplyr)
library(ggplot2) library(quantmod)
## Warning: package 'quantmod' was built under R version 4.3.2
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.2
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
# Definir colores rojo <- '#FF6262' azul <- '#3434FF' # Cargar y preparar el archivo original bd <- read.csv("C:\\Users\\Deisy Lopez\\Documents\\RStudio_MPF\\Actividad2.csv") bd <- bd[-1, ] bd <- bd %>% mutate(across(where(is.character), as.numeric)) glimpse(bd)
## Rows: 30,000 ## Columns: 25 ## $ X <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,… ## $ X1 <dbl> 20000, 120000, 90000, 50000, 50000, 50000, 500000, 100000, 140000,… ## $ X2 <dbl> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, … ## $ X3 <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 1, 1, 3, 2, … ## $ X4 <dbl> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 2, 2, 1, … ## $ X5 <dbl> 24, 26, 34, 37, 57, 37, 29, 23, 28, 35, 34, 51, 41, 30, 29, 23, 24… ## $ X6 <dbl> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0, -1, -1, 1, 0, 1, 0, 0, 1, 1, 0… ## $ X7 <dbl> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, -1, 0, 2, 0, 2, 0, 0, -2, -2, 0… ## $ X8 <dbl> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2, -1, -1, 2, 0, 0, 2, 0, -2, -2… ## $ X9 <dbl> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,… ## $ X10 <dbl> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,… ## $ X11 <dbl> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1, 2, -1, 2, 0, 0, 2, -1, -2, -2… ## $ X12 <dbl> 3913, 2682, 29239, 46990, 8617, 64400, 367965, 11876, 11285, 0, 11… ## $ X13 <dbl> 3102, 1725, 14027, 48233, 5670, 57069, 412023, 380, 14096, 0, 9787… ## $ X14 <dbl> 689, 2682, 13559, 49291, 35835, 57608, 445007, 601, 12108, 0, 5535… ## $ X15 <dbl> 0, 3272, 14331, 28314, 20940, 19394, 542653, 221, 12211, 0, 2513, … ## $ X16 <dbl> 0, 3455, 14948, 28959, 19146, 19619, 483003, -159, 11793, 13007, 1… ## $ X17 <dbl> 0, 3261, 15549, 29547, 19131, 20024, 473944, 567, 3719, 13912, 373… ## $ X18 <dbl> 0, 0, 1518, 2000, 2000, 2500, 55000, 380, 3329, 0, 2306, 21818, 10… ## $ X19 <dbl> 689, 1000, 1500, 2019, 36681, 1815, 40000, 601, 0, 0, 12, 9966, 65… ## $ X20 <dbl> 0, 1000, 1000, 1200, 10000, 657, 38000, 0, 432, 0, 50, 8583, 6500,… ## $ X21 <dbl> 0, 1000, 1000, 1100, 9000, 1000, 20239, 581, 1000, 13007, 300, 223… ## $ X22 <dbl> 0, 0, 1000, 1069, 689, 1000, 13750, 1687, 1000, 1122, 3738, 0, 287… ## $ X23 <dbl> 0, 2000, 5000, 1000, 679, 800, 13770, 1542, 1000, 0, 66, 3640, 0, … ## $ Y <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, …
summary(bd)
## X X1 X2 X3 ## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000 ## 1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000 ## Median :15000 Median : 140000 Median :2.000 Median :2.000 ## Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853 ## 3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000 ## Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000 ## X4 X5 X6 X7 ## Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000 ## 1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000 ## Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000 ## Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338 ## 3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000 ## Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000 ## X8 X9 X10 X11 ## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 ## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 ## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000 ## Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911 ## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 ## Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 ## X12 X13 X14 X15 ## Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000 ## 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327 ## Median : 22382 Median : 21200 Median : 20089 Median : 19052 ## Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263 ## 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506 ## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586 ## X16 X17 X18 X19 ## Min. :-81334 Min. :-339603 Min. : 0 Min. : 0 ## 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833 ## Median : 18105 Median : 17071 Median : 2100 Median : 2009 ## Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921 ## 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000 ## Max. :927171 Max. : 961664 Max. :873552 Max. :1684259 ## X20 X21 X22 X23 ## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0 ## 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8 ## Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0 ## Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5 ## 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0 ## Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0 ## Y ## Min. :0.0000 ## 1st Qu.:0.0000 ## Median :0.0000 ## Mean :0.2212 ## 3rd Qu.:0.0000 ## Max. :1.0000
# Crear una gráfica profesional con los datos originales bd %>% ggplot(aes(x = X5, y = X1, color = factor(Y))) + geom_point(size = 1, alpha = 0.6) + labs(title = "Edad vs Monto del Crédito", subtitle = "Color por Incumplimiento", x = "Edad", y = "Monto del Crédito") + scale_color_manual(values = c(rojo, azul))
Modelos de Regresión
#Regresión lineal LMcredito <- lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23, data=bd) summary(LMcredito)
## ## Call: ## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + ## X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + ## X20 + X21 + X22 + X23, data = bd) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.29527 -0.24079 -0.16177 0.03385 1.30480 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 3.142e-01 1.791e-02 17.541 < 2e-16 *** ## X1 -9.053e-08 2.159e-08 -4.193 2.76e-05 *** ## X2 -1.453e-02 4.642e-03 -3.130 0.00175 ** ## X3 -1.513e-02 3.012e-03 -5.022 5.15e-07 *** ## X4 -2.382e-02 4.768e-03 -4.996 5.88e-07 *** ## X5 1.409e-03 2.749e-04 5.128 2.95e-07 *** ## X6 9.571e-02 2.766e-03 34.596 < 2e-16 *** ## X7 1.946e-02 3.339e-03 5.828 5.68e-09 *** ## X8 1.167e-02 3.585e-03 3.256 0.00113 ** ## X9 3.362e-03 3.974e-03 0.846 0.39755 ## X10 5.699e-03 4.304e-03 1.324 0.18545 ## X11 7.920e-04 3.521e-03 0.225 0.82201 ## X12 -6.225e-07 1.141e-07 -5.453 4.98e-08 *** ## X13 1.587e-07 1.603e-07 0.990 0.32225 ## X14 3.005e-08 1.510e-07 0.199 0.84222 ## X15 -6.793e-08 1.573e-07 -0.432 0.66587 ## X16 -2.049e-08 1.845e-07 -0.111 0.91159 ## X17 1.153e-07 1.460e-07 0.789 0.42998 ## X18 -7.437e-07 1.770e-07 -4.201 2.67e-05 *** ## X19 -2.092e-07 1.457e-07 -1.436 0.15095 ## X20 -2.874e-08 1.689e-07 -0.170 0.86492 ## X21 -2.521e-07 1.839e-07 -1.371 0.17047 ## X22 -3.410e-07 1.908e-07 -1.787 0.07393 . ## X23 -9.770e-08 1.365e-07 -0.716 0.47422 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.3886 on 29976 degrees of freedom ## Multiple R-squared: 0.124, Adjusted R-squared: 0.1233 ## F-statistic: 184.5 on 23 and 29976 DF, p-value: < 2.2e-16
#Regresión lineal con variables significativas LMcredito_sig <- lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + X18, data=bd) summary(LMcredito_sig)
## ## Call: ## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + ## X18, data = bd) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.30307 -0.23949 -0.16138 0.03274 1.26179 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 3.133e-01 1.791e-02 17.495 < 2e-16 *** ## X1 -1.105e-07 2.082e-08 -5.309 1.11e-07 *** ## X2 -1.429e-02 4.640e-03 -3.080 0.00207 ** ## X3 -1.528e-02 3.011e-03 -5.075 3.91e-07 *** ## X4 -2.403e-02 4.767e-03 -5.041 4.67e-07 *** ## X5 1.412e-03 2.749e-04 5.138 2.79e-07 *** ## X6 9.737e-02 2.740e-03 35.537 < 2e-16 *** ## X7 2.007e-02 3.306e-03 6.071 1.28e-09 *** ## X8 1.785e-02 2.983e-03 5.983 2.22e-09 *** ## X12 -4.587e-07 3.445e-08 -13.315 < 2e-16 *** ## X18 -7.858e-07 1.399e-07 -5.616 1.97e-08 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.3887 on 29989 degrees of freedom ## Multiple R-squared: 0.1232, Adjusted R-squared: 0.1229 ## F-statistic: 421.4 on 10 and 29989 DF, p-value: < 2.2e-16
# Regresión Logística GLMcredit <- glm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23, family=binomial, data=bd)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(GLMcredit)
## ## Call: ## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + ## X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + ## X20 + X21 + X22 + X23, family = binomial, data = bd) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -6.863e-01 1.187e-01 -5.784 7.30e-09 *** ## X1 -7.623e-07 1.569e-07 -4.859 1.18e-06 *** ## X2 -1.087e-01 3.069e-02 -3.541 0.000399 *** ## X3 -1.016e-01 2.097e-02 -4.844 1.27e-06 *** ## X4 -1.544e-01 3.170e-02 -4.869 1.12e-06 *** ## X5 7.420e-03 1.779e-03 4.170 3.04e-05 *** ## X6 5.774e-01 1.769e-02 32.632 < 2e-16 *** ## X7 8.282e-02 2.018e-02 4.103 4.07e-05 *** ## X8 7.214e-02 2.260e-02 3.192 0.001415 ** ## X9 2.389e-02 2.500e-02 0.956 0.339312 ## X10 3.401e-02 2.688e-02 1.266 0.205685 ## X11 8.038e-03 2.213e-02 0.363 0.716448 ## X12 -5.492e-06 1.136e-06 -4.835 1.33e-06 *** ## X13 2.356e-06 1.504e-06 1.566 0.117280 ## X14 1.365e-06 1.323e-06 1.032 0.302073 ## X15 -1.821e-07 1.349e-06 -0.135 0.892609 ## X16 6.155e-07 1.518e-06 0.405 0.685246 ## X17 3.938e-07 1.195e-06 0.330 0.741692 ## X18 -1.363e-05 2.305e-06 -5.913 3.36e-09 *** ## X19 -9.616e-06 2.095e-06 -4.590 4.42e-06 *** ## X20 -2.742e-06 1.723e-06 -1.592 0.111456 ## X21 -4.023e-06 1.785e-06 -2.254 0.024185 * ## X22 -3.311e-06 1.777e-06 -1.864 0.062387 . ## X23 -2.064e-06 1.296e-06 -1.593 0.111212 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 31705 on 29999 degrees of freedom ## Residual deviance: 27877 on 29976 degrees of freedom ## AIC: 27925 ## ## Number of Fisher Scoring iterations: 6
#Regresión logística valores significativos GLMcredit_sig <- glm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + X18 + X19, family=binomial, data=bd) summary(GLMcredit_sig)
## ## Call: ## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + ## X18 + X19, family = binomial, data = bd) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -7.075e-01 1.184e-01 -5.975 2.31e-09 *** ## X1 -9.505e-07 1.515e-07 -6.273 3.55e-10 *** ## X2 -1.042e-01 3.063e-02 -3.403 0.000666 *** ## X3 -1.045e-01 2.090e-02 -5.001 5.70e-07 *** ## X4 -1.574e-01 3.165e-02 -4.974 6.57e-07 *** ## X5 7.634e-03 1.777e-03 4.296 1.74e-05 *** ## X6 5.938e-01 1.753e-02 33.875 < 2e-16 *** ## X7 8.373e-02 1.993e-02 4.201 2.65e-05 *** ## X8 1.208e-01 1.839e-02 6.569 5.06e-11 *** ## X12 -1.844e-06 2.627e-07 -7.020 2.21e-12 *** ## X18 -1.217e-05 2.095e-06 -5.811 6.20e-09 *** ## X19 -8.591e-06 1.806e-06 -4.756 1.98e-06 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 31705 on 29999 degrees of freedom ## Residual deviance: 27935 on 29988 degrees of freedom ## AIC: 27959 ## ## Number of Fisher Scoring iterations: 6
# Gráfica bd$prediccion_GLM <- predict(GLMcredit_sig, newdata = bd, type = "response") bd %>% ggplot(aes(x = X5, y = prediccion_GLM, color = factor(Y))) + geom_point(size = 1, alpha = 0.6) + labs(title = "Predicción de Default vs Edad", subtitle = "Predicciones de Regresión Logística", x = "Edad", y = "Probabilidad de Default") + scale_color_manual(values = c(rojo, azul))
#Confusion Matrix predicted_class <- ifelse(bd$prediccion_GLM > 0.5, 1, 0) confusionMatrix(as.factor(predicted_class), as.factor(bd$Y))
## Confusion Matrix and Statistics ## ## Reference ## Prediction 0 1 ## 0 22721 5080 ## 1 643 1556 ## ## Accuracy : 0.8092 ## 95% CI : (0.8047, 0.8137) ## No Information Rate : 0.7788 ## P-Value [Acc > NIR] : < 2.2e-16 ## ## Kappa : 0.2721 ## ## Mcnemar's Test P-Value : < 2.2e-16 ## ## Sensitivity : 0.9725 ## Specificity : 0.2345 ## Pos Pred Value : 0.8173 ## Neg Pred Value : 0.7076 ## Prevalence : 0.7788 ## Detection Rate : 0.7574 ## Detection Prevalence : 0.9267 ## Balanced Accuracy : 0.6035 ## ## 'Positive' Class : 0 ##