Modelos de Regresión Lineal y Regresión Logística
# Load libraries (attaches packages that are already installed).
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(quantmod)
## Warning: package 'quantmod' was built under R version 4.3.2
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.2
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
library(readxl)

# Load the file.
# Earlier CSV-based variant, kept disabled:
# bd <- read.csv("Actividad2.csv")

# Interactive helper: opens a file picker and prints the chosen path.
# NOTE(review): the returned path is not stored anywhere; the workbook path
# used by read_excel() is hard-coded. This call needs an interactive session.
file.choose()
## [1] "C:\\Users\\sguerra\\Downloads\\Actividad 2 (3).Rhtml"
# Read the raw workbook into a tibble.
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
bd <- read_excel(
  "C:\\Users\\sguerra\\Downloads\\default of credit card clients.xls"
)

# Clean and convert the data.
# Drop the first data ROW (not a column): it only contains the descriptive
# labels of the variables.
bd <- bd[-1, ]

# Per the original author's note, every column was imported as character,
# so convert all character columns to numeric.
bd <- bd %>%
  mutate(across(where(is.character), as.numeric))

# Inspect the structure of the data.
glimpse(bd)
## Rows: 30,000 ## Columns: 25 ## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,… ## $ X1 <dbl> 20000, 120000, 90000, 50000, 50000, 50000, 500000, 100000, 140000,… ## $ X2 <dbl> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, … ## $ X3 <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 1, 1, 3, 2, … ## $ X4 <dbl> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 2, 2, 1, … ## $ X5 <dbl> 24, 26, 34, 37, 57, 37, 29, 23, 28, 35, 34, 51, 41, 30, 29, 23, 24… ## $ X6 <dbl> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0, -1, -1, 1, 0, 1, 0, 0, 1, 1, 0… ## $ X7 <dbl> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, -1, 0, 2, 0, 2, 0, 0, -2, -2, 0… ## $ X8 <dbl> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2, -1, -1, 2, 0, 0, 2, 0, -2, -2… ## $ X9 <dbl> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,… ## $ X10 <dbl> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, 0, 0, 0, 2, -1, -2, -2,… ## $ X11 <dbl> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1, 2, -1, 2, 0, 0, 2, -1, -2, -2… ## $ X12 <dbl> 3913, 2682, 29239, 46990, 8617, 64400, 367965, 11876, 11285, 0, 11… ## $ X13 <dbl> 3102, 1725, 14027, 48233, 5670, 57069, 412023, 380, 14096, 0, 9787… ## $ X14 <dbl> 689, 2682, 13559, 49291, 35835, 57608, 445007, 601, 12108, 0, 5535… ## $ X15 <dbl> 0, 3272, 14331, 28314, 20940, 19394, 542653, 221, 12211, 0, 2513, … ## $ X16 <dbl> 0, 3455, 14948, 28959, 19146, 19619, 483003, -159, 11793, 13007, 1… ## $ X17 <dbl> 0, 3261, 15549, 29547, 19131, 20024, 473944, 567, 3719, 13912, 373… ## $ X18 <dbl> 0, 0, 1518, 2000, 2000, 2500, 55000, 380, 3329, 0, 2306, 21818, 10… ## $ X19 <dbl> 689, 1000, 1500, 2019, 36681, 1815, 40000, 601, 0, 0, 12, 9966, 65… ## $ X20 <dbl> 0, 1000, 1000, 1200, 10000, 657, 38000, 0, 432, 0, 50, 8583, 6500,… ## $ X21 <dbl> 0, 1000, 1000, 1100, 9000, 1000, 20239, 581, 1000, 13007, 300, 223… ## $ X22 <dbl> 0, 0, 1000, 1069, 689, 1000, 13750, 1687, 1000, 1122, 3738, 0, 287… ## $ X23 <dbl> 0, 2000, 5000, 1000, 679, 800, 13770, 1542, 1000, 0, 
66, 3640, 0, … ## $ Y <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, …
# Print per-column summary statistics (min, quartiles, mean, max) for bd.
summary(bd)
## ID X1 X2 X3 ## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000 ## 1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000 ## Median :15000 Median : 140000 Median :2.000 Median :2.000 ## Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853 ## 3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000 ## Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000 ## X4 X5 X6 X7 ## Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000 ## 1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000 ## Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000 ## Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338 ## 3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000 ## Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000 ## X8 X9 X10 X11 ## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 ## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 ## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000 ## Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911 ## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 ## Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 ## X12 X13 X14 X15 ## Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000 ## 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327 ## Median : 22382 Median : 21200 Median : 20089 Median : 19052 ## Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263 ## 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506 ## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586 ## X16 X17 X18 X19 ## Min. :-81334 Min. :-339603 Min. : 0 Min. : 0 ## 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833 ## Median : 18105 Median : 17071 Median : 2100 Median : 2009 ## Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921 ## 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000 ## Max. :927171 Max. : 961664 Max. :873552 Max. :1684259 ## X20 X21 X22 X23 ## Min. : 0 Min. : 0 Min. : 0.0 Min. 
: 0.0 ## 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8 ## Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0 ## Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5 ## 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0 ## Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0 ## Y ## Min. :0.0000 ## 1st Qu.:0.0000 ## Median :0.0000 ## Mean :0.2212 ## 3rd Qu.:0.0000 ## Max. :1.0000
Y: Default (Yes = 1, No = 0) - Whether the individual has defaulted on their credit. X1: Amount of Credit - The credit limit provided to the individual by the bank. X2: Gender (1 = male, 2 = female) - The gender of the individual. X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others) - The highest level of education of the individual. X4: Marital Status (1 = married; 2 = single; 3 = others) - The marital status of the individual. X5: Age - The age of the individual in years. X6-X11: History of Past Payment - The individual's monthly repayment status from April to September 2005, on a scale that runs from paying duly to a payment delay of nine months or more. X12-X17: Amount of Bill Statement - The monthly bill statement amounts from April to September 2005. X18-X23: Amount of Previous Payment - The monthly amounts paid from April to September 2005.
# Linear regression of the default indicator Y on all 23 predictors (X1-X23).
# ID is deliberately left out of the formula.
# NOTE(review): Y is binary (0/1), so this is a linear probability model; the
# reported residual range (about -1.30 to 1.30) implies fitted values outside
# the [0, 1] interval.
LMcredito <- lm(
  Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 +
    X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23,
  data = bd
)

# Coefficient table, residual summary and fit statistics.
summary(LMcredito)
## ## Call: ## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + ## X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + ## X20 + X21 + X22 + X23, data = bd) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.29527 -0.24079 -0.16177 0.03385 1.30480 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 3.142e-01 1.791e-02 17.541 < 2e-16 *** ## X1 -9.053e-08 2.159e-08 -4.193 2.76e-05 *** ## X2 -1.453e-02 4.642e-03 -3.130 0.00175 ** ## X3 -1.513e-02 3.012e-03 -5.022 5.15e-07 *** ## X4 -2.382e-02 4.768e-03 -4.996 5.88e-07 *** ## X5 1.409e-03 2.749e-04 5.128 2.95e-07 *** ## X6 9.571e-02 2.766e-03 34.596 < 2e-16 *** ## X7 1.946e-02 3.339e-03 5.828 5.68e-09 *** ## X8 1.167e-02 3.585e-03 3.256 0.00113 ** ## X9 3.362e-03 3.974e-03 0.846 0.39755 ## X10 5.699e-03 4.304e-03 1.324 0.18545 ## X11 7.920e-04 3.521e-03 0.225 0.82201 ## X12 -6.225e-07 1.141e-07 -5.453 4.98e-08 *** ## X13 1.587e-07 1.603e-07 0.990 0.32225 ## X14 3.005e-08 1.510e-07 0.199 0.84222 ## X15 -6.793e-08 1.573e-07 -0.432 0.66587 ## X16 -2.049e-08 1.845e-07 -0.111 0.91159 ## X17 1.153e-07 1.460e-07 0.789 0.42998 ## X18 -7.437e-07 1.770e-07 -4.201 2.67e-05 *** ## X19 -2.092e-07 1.457e-07 -1.436 0.15095 ## X20 -2.874e-08 1.689e-07 -0.170 0.86492 ## X21 -2.521e-07 1.839e-07 -1.371 0.17047 ## X22 -3.410e-07 1.908e-07 -1.787 0.07393 . ## X23 -9.770e-08 1.365e-07 -0.716 0.47422 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.3886 on 29976 degrees of freedom ## Multiple R-squared: 0.124, Adjusted R-squared: 0.1233 ## F-statistic: 184.5 on 23 and 29976 DF, p-value: < 2.2e-16
# Linear regression restricted to the predictors judged significant in the
# full model.
# NOTE(review): X22 is kept although its p-value in the full model is 0.074
# (significant only at the 0.1 level); all other retained terms have p < 0.05.
LMcredito_sig <- lm(
  Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + X18 + X22,
  data = bd
)

# Coefficient table, residual summary and fit statistics.
summary(LMcredito_sig)
## ## Call: ## lm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + ## X18 + X22, data = bd) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.30289 -0.23964 -0.16149 0.03257 1.24712 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 3.131e-01 1.791e-02 17.482 < 2e-16 *** ## X1 -1.038e-07 2.102e-08 -4.937 7.99e-07 *** ## X2 -1.432e-02 4.640e-03 -3.086 0.00203 ** ## X3 -1.526e-02 3.010e-03 -5.070 4.00e-07 *** ## X4 -2.383e-02 4.767e-03 -4.999 5.79e-07 *** ## X5 1.411e-03 2.748e-04 5.132 2.88e-07 *** ## X6 9.718e-02 2.741e-03 35.455 < 2e-16 *** ## X7 2.020e-02 3.306e-03 6.110 1.01e-09 *** ## X8 1.779e-02 2.983e-03 5.964 2.49e-09 *** ## X12 -4.507e-07 3.462e-08 -13.017 < 2e-16 *** ## X18 -7.533e-07 1.406e-07 -5.357 8.54e-08 *** ## X22 -3.504e-07 1.523e-07 -2.300 0.02143 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.3887 on 29988 degrees of freedom ## Multiple R-squared: 0.1234, Adjusted R-squared: 0.123 ## F-statistic: 383.6 on 11 and 29988 DF, p-value: < 2.2e-16
# Logistic regression (binomial family) of the default indicator Y on the
# same reduced predictor set used for the second linear model.
GLMcredit <- glm(
  Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + X18 + X22,
  family = binomial,
  data = bd
)

# Coefficient table, deviances and AIC.
summary(GLMcredit)
## ## Call: ## glm(formula = Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X12 + ## X18 + X22, family = binomial, data = bd) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -7.163e-01 1.185e-01 -6.047 1.48e-09 *** ## X1 -9.879e-07 1.518e-07 -6.509 7.58e-11 *** ## X2 -1.027e-01 3.063e-02 -3.352 0.000803 *** ## X3 -1.055e-01 2.088e-02 -5.050 4.42e-07 *** ## X4 -1.573e-01 3.166e-02 -4.968 6.77e-07 *** ## X5 7.664e-03 1.777e-03 4.312 1.62e-05 *** ## X6 5.960e-01 1.752e-02 34.018 < 2e-16 *** ## X7 7.974e-02 1.991e-02 4.004 6.22e-05 *** ## X8 1.284e-01 1.837e-02 6.988 2.79e-12 *** ## X12 -1.891e-06 2.621e-07 -7.216 5.34e-13 *** ## X18 -1.346e-05 2.111e-06 -6.378 1.79e-10 *** ## X22 -4.583e-06 1.529e-06 -2.998 0.002718 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 31705 on 29999 degrees of freedom ## Residual deviance: 27956 on 29988 degrees of freedom ## AIC: 27980 ## ## Number of Fisher Scoring iterations: 5
# 'new_data' holds the observation(s) to score. It must provide a real value
# for every predictor used by GLMcredit (X1-X8, X12, X18, X22).
# The example values below equal those of the record with ID 5 shown in the
# glimpse(bd) output.
new_data <- data.frame(
  X1 = 50000, # example credit amount
  X2 = 1,     # example gender
  X3 = 2,     # example education level
  X4 = 1,     # example marital status
  X5 = 57,    # example age
  X6 = -1,    # example past-payment status
  X7 = 0,     # continue with real values for each remaining variable...
  X8 = -1,
  X12 = 8617,
  X18 = 2000,
  X22 = 689
)

# Use the model to predict. type = "response" returns probabilities rather
# than values on the log-odds (link) scale.
probabilidades <- predict(GLMcredit, newdata = new_data, type = "response")

# One predicted probability of default per row of 'new_data'.
print(probabilidades)
## 1 ## 0.1721439