Reg Logistica - Creditos

#Cargue Librerias
library(readxl)
library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(CGPfunctions)
library(sqldf)

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

library(caret)

## Loading required package: lattice

library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-4

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:plotly':
## 
##     select

library(car)

## Loading required package: carData

datos2 = read_excel("G:/ACADEMIA/JAVERIANA CALI/2. SEMESTRE 2022 - II/2. Met. Estad. para la Toma Decisiones/Logistica/Datos_Creditos.xlsx")
head(datos2)

## # A tibble: 6 × 5
##   DEFAULT ANTIGUEDAD  EDAD CUOTA_TOTAL INGRESOS
##     <dbl>      <dbl> <dbl>       <dbl>    <dbl>
## 1       1      37.3   77.0     3020519  8155593
## 2       1      37.3   73.8     1766552  6181263
## 3       1      31.0   78.9     1673786  4328075
## 4       1       9.73  51.5      668479  5290910
## 5       1       8.44  39.0     1223559  5333818
## 6       1       6.61  44.9     3517756  2710736

Creacion del dataset train y test

nrow(datos2)

## [1] 780

ntrain <- nrow(datos2)*0.8
ntest <- nrow(datos2)*0.2
c(ntrain,ntest)

## [1] 624 156

set.seed(740)
index_train<-sample(1:nrow(datos2),size = ntrain)
train<-datos2[index_train,]
test<-datos2[-index_train,]

summary(train)

##     DEFAULT          ANTIGUEDAD           EDAD        CUOTA_TOTAL     
##  Min.   :0.00000   Min.   : 0.2548   Min.   :26.61   Min.   :    387  
##  1st Qu.:0.00000   1st Qu.: 7.4884   1st Qu.:47.65   1st Qu.: 329083  
##  Median :0.00000   Median :14.8616   Median :57.05   Median : 658548  
##  Mean   :0.04968   Mean   :17.8888   Mean   :56.58   Mean   : 874371  
##  3rd Qu.:0.00000   3rd Qu.:30.4431   3rd Qu.:65.82   3rd Qu.:1233921  
##  Max.   :1.00000   Max.   :37.3178   Max.   :92.43   Max.   :6664588  
##     INGRESOS       
##  Min.   :  701758  
##  1st Qu.: 3494622  
##  Median : 4964190  
##  Mean   : 5317859  
##  3rd Qu.: 6817749  
##  Max.   :22197021

summary(test)

##     DEFAULT          ANTIGUEDAD           EDAD        CUOTA_TOTAL     
##  Min.   :0.00000   Min.   : 0.5041   Min.   :28.97   Min.   :  24115  
##  1st Qu.:0.00000   1st Qu.: 6.8897   1st Qu.:49.80   1st Qu.: 315632  
##  Median :0.00000   Median :16.6069   Median :59.22   Median : 827354  
##  Mean   :0.05128   Mean   :18.6212   Mean   :58.62   Mean   : 928545  
##  3rd Qu.:0.00000   3rd Qu.:31.5562   3rd Qu.:68.52   3rd Qu.:1363527  
##  Max.   :1.00000   Max.   :37.3178   Max.   :91.64   Max.   :3788917  
##     INGRESOS       
##  Min.   :  633825  
##  1st Qu.: 3782509  
##  Median : 5213904  
##  Mean   : 5560716  
##  3rd Qu.: 6902757  
##  Max.   :19548379

Construccion modelo sobre train

#estimacion del modelo
mod_glm_train <- glm(DEFAULT ~ ANTIGUEDAD + EDAD + CUOTA_TOTAL + INGRESOS, 
                data = train, family = "binomial")
summary(mod_glm_train)

## 
## Call:
## glm(formula = DEFAULT ~ ANTIGUEDAD + EDAD + CUOTA_TOTAL + INGRESOS, 
##     family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7367  -0.3624  -0.2963  -0.2186   2.9830  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.890e+00  1.058e+00  -2.730 0.006328 ** 
## ANTIGUEDAD  -2.555e-02  2.635e-02  -0.970 0.332199    
## EDAD         1.116e-02  2.249e-02   0.496 0.619871    
## CUOTA_TOTAL  8.541e-07  2.583e-07   3.307 0.000944 ***
## INGRESOS    -2.258e-07  1.142e-07  -1.977 0.048017 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 246.57  on 623  degrees of freedom
## Residual deviance: 233.82  on 619  degrees of freedom
## AIC: 243.82
## 
## Number of Fisher Scoring iterations: 6

step_train<- stepAIC(mod_glm_train, direction="both")

## Start:  AIC=243.82
## DEFAULT ~ ANTIGUEDAD + EDAD + CUOTA_TOTAL + INGRESOS
## 
##               Df Deviance    AIC
## - EDAD         1   234.06 242.06
## - ANTIGUEDAD   1   234.74 242.74
## <none>             233.82 243.82
## - INGRESOS     1   238.32 246.32
## - CUOTA_TOTAL  1   244.34 252.34
## 
## Step:  AIC=242.06
## DEFAULT ~ ANTIGUEDAD + CUOTA_TOTAL + INGRESOS
## 
##               Df Deviance    AIC
## - ANTIGUEDAD   1   234.81 240.81
## <none>             234.06 242.06
## + EDAD         1   233.82 243.82
## - INGRESOS     1   238.44 244.44
## - CUOTA_TOTAL  1   244.40 250.40
## 
## Step:  AIC=240.81
## DEFAULT ~ CUOTA_TOTAL + INGRESOS
## 
##               Df Deviance    AIC
## <none>             234.81 240.81
## + ANTIGUEDAD   1   234.06 242.06
## + EDAD         1   234.74 242.74
## - INGRESOS     1   241.90 245.90
## - CUOTA_TOTAL  1   244.69 248.69

#CALCULO ROC Y AUC sobre test

# estimamos la probabilidad
default_prob_test <- predict(step_train, type = "response", newdata = test)

# Pintamos  ROC
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

ROC <- roc(test$DEFAULT, default_prob_test)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(ROC, col = "red")

#area bajo la curva sobre test
auc(ROC)

## Area under the curve: 0.7745

se obtiene un desempeño buen del modelo, validado sobre la muestra de test

#construccion matriz de confusion
library(vcd)

## Loading required package: grid

predicciones <- ifelse(test = step_train$fitted.values > 0.04, yes = 1, no = 0)

matriz_confusion <- table(step_train$model$DEFAULT, predicciones,
          dnn = c("observaciones", "predicciones"))
matriz_confusion

##              predicciones
## observaciones   0   1
##             0 242 351
##             1   9  22

summary(step_train)

## 
## Call:
## glm(formula = DEFAULT ~ CUOTA_TOTAL + INGRESOS, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6720  -0.3578  -0.2964  -0.2285   2.8994  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.488e+00  4.425e-01  -5.622 1.89e-08 ***
## CUOTA_TOTAL  8.197e-07  2.552e-07   3.212  0.00132 ** 
## INGRESOS    -2.602e-07  1.070e-07  -2.431  0.01506 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 246.57  on 623  degrees of freedom
## Residual deviance: 234.81  on 621  degrees of freedom
## AIC: 240.81
## 
## Number of Fisher Scoring iterations: 6

Se obtiene un modelo consistente, se reducen variables via stepwise, obteniendo un AUC de 77%. El punto de corte debe bajarse significativamente, teniendo en cuenta que la tasa de respuesta es baja, sin embargo esto implica que la distribuciónn de la probabilidad esta muy sesgada.

hist(step_train$fitted.values, main = "Distribucion de las probabilidades calculadas",
     xlab = "Probabilidad")

Metricas de Desempeño

 n = sum(matriz_confusion) # number of instances
 nc = nrow(matriz_confusion) # number of classes
 diag = diag(matriz_confusion) # number of correctly classified instances per class 
 rowsums = apply(matriz_confusion, 1, sum) # number of instances per class
 colsums = apply(matriz_confusion, 2, sum) # number of predictions per class
 p = rowsums / n # distribution of instances over the actual classes
 q = colsums / n # d

 accuracy = sum(diag) / n 
 accuracy

## [1] 0.4230769

 precision = diag / colsums 
 recall = diag / rowsums 
 f1 = 2 * precision * recall / (precision + recall)

 data.frame(precision, recall, f1)

##    precision    recall        f1
## 0 0.96414343 0.4080944 0.5734597
## 1 0.05898123 0.7096774 0.1089109

  macroPrecision = mean(precision)
  macroRecall = mean(recall)
  macroF1 = mean(f1)

  data.frame(macroPrecision, macroRecall, macroF1)

##   macroPrecision macroRecall   macroF1
## 1      0.5115623   0.5588859 0.3411853

Reg Logistica - Creditos

Juan Pablo Gomez Bravo

30/9/2022