library(paqueteMOD)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
require(ggpubr)
## Loading required package: ggpubr
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ROSE)
## Loaded ROSE 0.0-4
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
require(CGPfunctions)
## Loading required package: CGPfunctions
data("creditos")
glimpse(creditos)
## Rows: 780
## Columns: 5
## $ DEFAULT <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ ANTIUEDAD <dbl> 37.317808, 37.317808, 30.978082, 9.728767, 8.443836, 6.605…
## $ EDAD <dbl> 76.98356, 73.77534, 78.93699, 51.52877, 38.96986, 44.87945…
## $ CUOTA_TOTAL <dbl> 3020519, 1766552, 1673786, 668479, 1223559, 3517756, 13047…
## $ INGRESOS <dbl> 8155593, 6181263, 4328075, 5290910, 5333818, 2710736, 3169…
g3=ggplot(creditos,aes(x=INGRESOS))+geom_histogram(binwidth = 1000000)+theme_bw()
ggarrange(g3,labels = c("Ingresos"),ncol = 1, nrow = 1)
boxplot(creditos$INGRESOS)
summary(creditos$INGRESOS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 633825 3583324 5038962 5366430 6844098 22197021
sd(creditos$INGRESOS)
## [1] 2652186
g1=ggplot(creditos,aes(x=CUOTA_TOTAL))+geom_histogram(binwidth = 100000)+theme_bw()+scale_fill_gradient("Count", low = "green", high = "red")
ggarrange(g1,labels = c("Cuota Total"),ncol = 1, nrow = 1)
boxplot(creditos$CUOTA_TOTAL)
summary(creditos$CUOTA_TOTAL)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 387 328516 694460 885206 1244126 6664588
sd(creditos$CUOTA_TOTAL)
## [1] 740212.3
modelo_credito=glm(DEFAULT ~ INGRESOS
+ CUOTA_TOTAL,
family = binomial(link="logit"),
data=creditos)
summary(modelo_credito)
##
## Call:
## glm(formula = DEFAULT ~ INGRESOS + CUOTA_TOTAL, family = binomial(link = "logit"),
## data = creditos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6901 -0.3648 -0.2928 -0.2113 2.9753
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.353e+00 3.966e-01 -5.933 2.97e-09 ***
## INGRESOS -3.134e-07 1.005e-07 -3.119 0.001817 **
## CUOTA_TOTAL 9.341e-07 2.404e-07 3.885 0.000102 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 309.68 on 779 degrees of freedom
## Residual deviance: 291.37 on 777 degrees of freedom
## AIC: 297.37
##
## Number of Fisher Scoring iterations: 6
estadistico_X2 = modelo_credito$null.deviance - modelo_credito$deviance
estadistico_X2
## [1] 18.31251
gl = modelo_credito$df.null - modelo_credito$df.residual
chi_result=pchisq(estadistico_X2, df = gl, lower.tail = FALSE)
chi_result
## [1] 0.0001055576
1-chi_result
## [1] 0.9998944
n_train <- nrow(creditos)*0.6
n_test <- nrow(creditos)*0.4
set.seed(123)
index_train<-sample(1:nrow(creditos),size = n_train)
#Entrenamiento
train<-creditos[index_train,]
# Test
test<-creditos[-index_train,]
#Revisión de balanceo de los datos en los sets de train y test
prop.table(table(train$DEFAULT))
##
## 0 1
## 0.94871795 0.05128205
prop.table(table(test$DEFAULT))
##
## 0 1
## 0.95192308 0.04807692
#Balanceo por oversampling
train_b=ovun.sample(DEFAULT~., data = train,
p = 0.5, seed = 1,
method = "over")$data
test_b=ovun.sample(DEFAULT~., data = test,
p = 0.5, seed = 1,
method = "over")$data
#Datos Balanceados
prop.table(table(train_b$DEFAULT))
##
## 0 1
## 0.5068493 0.4931507
modelo_creditos_2=glm(DEFAULT ~ CUOTA_TOTAL
+ INGRESOS,
family = binomial(link="logit"),
data=train_b)
# Matriz de Confusión para evaluación de desempeño
valor_pronosticado.b <- predict(modelo_creditos_2,test_b,type = "response")
niveles_pronosticados.b <- ifelse(valor_pronosticado.b > 0.5,1,0)
niveles_pronosticados.b <-factor(niveles_pronosticados.b)
rendimiento_data <- data.frame(observados = test_b$DEFAULT,
predicciones =niveles_pronosticados.b)
Positivos <- sum(rendimiento_data$observados == 1)
Negativos <- sum(rendimiento_data$observados == 0)
Positivos_pronosticados <- sum(rendimiento_data$predicciones ==1)
Negativos_pronosticados <- sum(rendimiento_data$predicciones == 0)
Total <- nrow(rendimiento_data)
VP <- sum(rendimiento_data$observados == 1 & rendimiento_data$predicciones == 1)
VN <- sum(rendimiento_data$observados == 0 & rendimiento_data$predicciones == 0)
FP <- sum(rendimiento_data$observados == 0 & rendimiento_data$predicciones == 1)
FN <- sum(rendimiento_data$observados == 1 & rendimiento_data$predicciones == 0)
matriz_confusion = matrix(c(VP, FP, FN,VN), nrow = 2)
#Sí para DEFAULT == 1 y NO para DEFAULT == 0
rownames(matriz_confusion) = c(" Si ", " No ")
colnames(matriz_confusion) = c("Si", "No")
matriz_confusion
## Si No
## Si 180 104
## No 112 185
curva_ROC <- roc(test_b$DEFAULT, valor_pronosticado.b)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc<- round(auc(curva_ROC, levels =c(0,1), direction = "<"),4) # 0.9177
ggroc(curva_ROC, colour = "#FF7F00", size=1)+
ggtitle(paste0("Curva ROC ", "(AUC = ", auc, ")"))+
xlab("Especificidad")+
ylab("Sensibilidad")
Se puede considerar que el modelo tiene una alta capacidad de clasificar correctamente los valores positivos para el DEFAULT, así como los valores negativos. El AUC corresponde a 0.693.
predict(modelo_creditos_2,list(CUOTA_TOTAL=12000000,
INGRESOS=3000000,
type="response"))
## 1
## 10.37083