#Cargue Librerias
library(readxl)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(CGPfunctions)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
library(car)
## Loading required package: carData
datos2 = read_excel("G:/ACADEMIA/JAVERIANA CALI/2. SEMESTRE 2022 - II/2. Met. Estad. para la Toma Decisiones/Logistica/Datos_Creditos.xlsx")
head(datos2)
## # A tibble: 6 × 5
## DEFAULT ANTIGUEDAD EDAD CUOTA_TOTAL INGRESOS
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 37.3 77.0 3020519 8155593
## 2 1 37.3 73.8 1766552 6181263
## 3 1 31.0 78.9 1673786 4328075
## 4 1 9.73 51.5 668479 5290910
## 5 1 8.44 39.0 1223559 5333818
## 6 1 6.61 44.9 3517756 2710736
Creacion del dataset train y test
nrow(datos2)
## [1] 780
ntrain <- nrow(datos2)*0.8
ntest <- nrow(datos2)*0.2
c(ntrain,ntest)
## [1] 624 156
set.seed(740)
index_train<-sample(1:nrow(datos2),size = ntrain)
train<-datos2[index_train,]
test<-datos2[-index_train,]
summary(train)
## DEFAULT ANTIGUEDAD EDAD CUOTA_TOTAL
## Min. :0.00000 Min. : 0.2548 Min. :26.61 Min. : 387
## 1st Qu.:0.00000 1st Qu.: 7.4884 1st Qu.:47.65 1st Qu.: 329083
## Median :0.00000 Median :14.8616 Median :57.05 Median : 658548
## Mean :0.04968 Mean :17.8888 Mean :56.58 Mean : 874371
## 3rd Qu.:0.00000 3rd Qu.:30.4431 3rd Qu.:65.82 3rd Qu.:1233921
## Max. :1.00000 Max. :37.3178 Max. :92.43 Max. :6664588
## INGRESOS
## Min. : 701758
## 1st Qu.: 3494622
## Median : 4964190
## Mean : 5317859
## 3rd Qu.: 6817749
## Max. :22197021
summary(test)
## DEFAULT ANTIGUEDAD EDAD CUOTA_TOTAL
## Min. :0.00000 Min. : 0.5041 Min. :28.97 Min. : 24115
## 1st Qu.:0.00000 1st Qu.: 6.8897 1st Qu.:49.80 1st Qu.: 315632
## Median :0.00000 Median :16.6069 Median :59.22 Median : 827354
## Mean :0.05128 Mean :18.6212 Mean :58.62 Mean : 928545
## 3rd Qu.:0.00000 3rd Qu.:31.5562 3rd Qu.:68.52 3rd Qu.:1363527
## Max. :1.00000 Max. :37.3178 Max. :91.64 Max. :3788917
## INGRESOS
## Min. : 633825
## 1st Qu.: 3782509
## Median : 5213904
## Mean : 5560716
## 3rd Qu.: 6902757
## Max. :19548379
Construccion modelo sobre train
#estimacion del modelo
mod_glm_train <- glm(DEFAULT ~ ANTIGUEDAD + EDAD + CUOTA_TOTAL + INGRESOS,
data = train, family = "binomial")
summary(mod_glm_train)
##
## Call:
## glm(formula = DEFAULT ~ ANTIGUEDAD + EDAD + CUOTA_TOTAL + INGRESOS,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7367 -0.3624 -0.2963 -0.2186 2.9830
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.890e+00 1.058e+00 -2.730 0.006328 **
## ANTIGUEDAD -2.555e-02 2.635e-02 -0.970 0.332199
## EDAD 1.116e-02 2.249e-02 0.496 0.619871
## CUOTA_TOTAL 8.541e-07 2.583e-07 3.307 0.000944 ***
## INGRESOS -2.258e-07 1.142e-07 -1.977 0.048017 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 246.57 on 623 degrees of freedom
## Residual deviance: 233.82 on 619 degrees of freedom
## AIC: 243.82
##
## Number of Fisher Scoring iterations: 6
step_train<- stepAIC(mod_glm_train, direction="both")
## Start: AIC=243.82
## DEFAULT ~ ANTIGUEDAD + EDAD + CUOTA_TOTAL + INGRESOS
##
## Df Deviance AIC
## - EDAD 1 234.06 242.06
## - ANTIGUEDAD 1 234.74 242.74
## <none> 233.82 243.82
## - INGRESOS 1 238.32 246.32
## - CUOTA_TOTAL 1 244.34 252.34
##
## Step: AIC=242.06
## DEFAULT ~ ANTIGUEDAD + CUOTA_TOTAL + INGRESOS
##
## Df Deviance AIC
## - ANTIGUEDAD 1 234.81 240.81
## <none> 234.06 242.06
## + EDAD 1 233.82 243.82
## - INGRESOS 1 238.44 244.44
## - CUOTA_TOTAL 1 244.40 250.40
##
## Step: AIC=240.81
## DEFAULT ~ CUOTA_TOTAL + INGRESOS
##
## Df Deviance AIC
## <none> 234.81 240.81
## + ANTIGUEDAD 1 234.06 242.06
## + EDAD 1 234.74 242.74
## - INGRESOS 1 241.90 245.90
## - CUOTA_TOTAL 1 244.69 248.69
#CALCULO ROC Y AUC sobre test
# estimamos la probabilidad
default_prob_test <- predict(step_train, type = "response", newdata = test)
# Pintamos ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
ROC <- roc(test$DEFAULT, default_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC, col = "red")
#area bajo la curva sobre test
auc(ROC)
## Area under the curve: 0.7745
se obtiene un desempeño buen del modelo, validado sobre la muestra de test
#construccion matriz de confusion
library(vcd)
## Loading required package: grid
predicciones <- ifelse(test = step_train$fitted.values > 0.04, yes = 1, no = 0)
matriz_confusion <- table(step_train$model$DEFAULT, predicciones,
dnn = c("observaciones", "predicciones"))
matriz_confusion
## predicciones
## observaciones 0 1
## 0 242 351
## 1 9 22
summary(step_train)
##
## Call:
## glm(formula = DEFAULT ~ CUOTA_TOTAL + INGRESOS, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6720 -0.3578 -0.2964 -0.2285 2.8994
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.488e+00 4.425e-01 -5.622 1.89e-08 ***
## CUOTA_TOTAL 8.197e-07 2.552e-07 3.212 0.00132 **
## INGRESOS -2.602e-07 1.070e-07 -2.431 0.01506 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 246.57 on 623 degrees of freedom
## Residual deviance: 234.81 on 621 degrees of freedom
## AIC: 240.81
##
## Number of Fisher Scoring iterations: 6
Se obtiene un modelo consistente, se reducen variables via stepwise, obteniendo un AUC de 77%. El punto de corte debe bajarse significativamente, teniendo en cuenta que la tasa de respuesta es baja, sin embargo esto implica que la distribuciónn de la probabilidad esta muy sesgada.
hist(step_train$fitted.values, main = "Distribucion de las probabilidades calculadas",
xlab = "Probabilidad")
Metricas de Desempeño
n = sum(matriz_confusion) # number of instances
nc = nrow(matriz_confusion) # number of classes
diag = diag(matriz_confusion) # number of correctly classified instances per class
rowsums = apply(matriz_confusion, 1, sum) # number of instances per class
colsums = apply(matriz_confusion, 2, sum) # number of predictions per class
p = rowsums / n # distribution of instances over the actual classes
q = colsums / n # d
accuracy = sum(diag) / n
accuracy
## [1] 0.4230769
precision = diag / colsums
recall = diag / rowsums
f1 = 2 * precision * recall / (precision + recall)
data.frame(precision, recall, f1)
## precision recall f1
## 0 0.96414343 0.4080944 0.5734597
## 1 0.05898123 0.7096774 0.1089109
macroPrecision = mean(precision)
macroRecall = mean(recall)
macroF1 = mean(f1)
data.frame(macroPrecision, macroRecall, macroF1)
## macroPrecision macroRecall macroF1
## 1 0.5115623 0.5588859 0.3411853