library(readxl)
vencimientos <- read_excel("C:/Users/Jamileth/Documents/MAESTRÍA 2/TFM/bases/vencimientos.xlsx")
bddv<-vencimientos[vencimientos$PRODUCTO=="DEPOSITO PLAZO FIJO",]
table(bddv$RENOVADO)
##
## NO SI
## 3971 17043
bddv$RENOVADO <- ifelse (bddv$RENOVADO =="SI",1,0)
table(bddv$RENOVADO)
##
## 0 1
## 3971 17043
prop.table(table(bddv$RENOVADO))
##
## 0 1
## 0.1889693 0.8110307
library(caTools)
set.seed(12345)
division <- sample.split(bddv$RENOVADO, SplitRatio = 0.8)
entrenamiento <- subset (bddv, division==TRUE)
validacion <- subset (bddv, division==FALSE)
prop.table(table(entrenamiento$RENOVADO))
##
## 0 1
## 0.1889834 0.8110166
prop.table(table(validacion$RENOVADO))
##
## 0 1
## 0.1889127 0.8110873
library (rpart)
library (rpart.plot)
modeloarbol <- rpart (RENOVADO ~ CAPITAL + PLAZO + TASA + EDAD + as.factor(MES)+ as.factor(QUITO)+ NUMERORENOVACIONES +
as.factor(FRECUENCIAPAGOINT)+ as.factor(BANDAMADURACION) + PIGNORADO + as.factor(OFICINA),
data = entrenamiento, method = "class")
summary(modeloarbol)
## Call:
## rpart(formula = RENOVADO ~ CAPITAL + PLAZO + TASA + EDAD + as.factor(MES) +
## as.factor(QUITO) + NUMERORENOVACIONES + as.factor(FRECUENCIAPAGOINT) +
## as.factor(BANDAMADURACION) + PIGNORADO + as.factor(OFICINA),
## data = entrenamiento, method = "class")
## n= 16811
##
## CP nsplit rel error xerror xstd
## 1 0.14604973 0 1.0000000 1.0000000 0.01597741
## 2 0.01164621 1 0.8539503 0.8611898 0.01506497
## 3 0.01000000 2 0.8423041 0.8479698 0.01497118
##
## Variable importance
## CAPITAL PLAZO as.factor(OFICINA) as.factor(QUITO)
## 72 15 11 2
##
## Node number 1: 16811 observations, complexity param=0.1460497
## predicted class=1 expected loss=0.1889834 P(node) =1
## class counts: 3177 13634
## probabilities: 0.189 0.811
## left son=2 (974 obs) right son=3 (15837 obs)
## Primary splits:
## CAPITAL < 477.355 to the left, improve=623.7145, (0 missing)
## TASA < 0.07775 to the left, improve=307.4560, (0 missing)
## as.factor(OFICINA) splits as RRRRRRRLRRRLLRRRRLRRRLLRRRRRRRLLRR, improve=274.9510, (0 missing)
## PLAZO < 745 to the right, improve=229.6481, (0 missing)
## as.factor(QUITO) splits as LR, improve=105.8979, (0 missing)
## Surrogate splits:
## PLAZO < 745 to the right, agree=0.954, adj=0.211, (0 split)
## as.factor(OFICINA) splits as RRRRRRRRRRRRRRRRRRRRRLRRRRRRRRRRRR, agree=0.947, adj=0.088, (0 split)
##
## Node number 2: 974 observations, complexity param=0.01164621
## predicted class=0 expected loss=0.261807 P(node) =0.05793825
## class counts: 719 255
## probabilities: 0.738 0.262
## left son=4 (875 obs) right son=5 (99 obs)
## Primary splits:
## as.factor(OFICINA) splits as LL-LRLRLLLLLLR-R---RLLLLRLRLR--LLR, improve=39.82172, (0 missing)
## as.factor(QUITO) splits as LR, improve=26.21216, (0 missing)
## as.factor(BANDAMADURACION) splits as RRLRR, improve=13.37964, (0 missing)
## CAPITAL < 371.81 to the left, improve=13.29754, (0 missing)
## as.factor(MES) splits as RLLRRLLRRLLL, improve=12.79186, (0 missing)
## Surrogate splits:
## as.factor(QUITO) splits as LR, agree=0.946, adj=0.465, (0 split)
## as.factor(FRECUENCIAPAGOINT) splits as LR, agree=0.906, adj=0.071, (0 split)
##
## Node number 3: 15837 observations
## predicted class=1 expected loss=0.1552062 P(node) =0.9420617
## class counts: 2458 13379
## probabilities: 0.155 0.845
##
## Node number 4: 875 observations
## predicted class=0 expected loss=0.2137143 P(node) =0.05204925
## class counts: 688 187
## probabilities: 0.786 0.214
##
## Node number 5: 99 observations
## predicted class=1 expected loss=0.3131313 P(node) =0.005889001
## class counts: 31 68
## probabilities: 0.313 0.687
#MODELO LOGIT
modelogit<- glm (RENOVADO ~ CAPITAL + PLAZO + TASA +as.factor(QUITO)+ NUMERORENOVACIONES + as.factor(BANDAMADURACION) + as.factor(PIGNORADO), data = bddv, family ="binomial" (link = "logit"))
summary(modelogit)
##
## Call:
## glm(formula = RENOVADO ~ CAPITAL + PLAZO + TASA + as.factor(QUITO) +
## NUMERORENOVACIONES + as.factor(BANDAMADURACION) + as.factor(PIGNORADO),
## family = binomial(link = "logit"), data = bddv)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -4.730e+00 2.444e-01 -19.352
## CAPITAL -1.251e-06 2.052e-07 -6.094
## PLAZO -2.790e-03 1.390e-04 -20.072
## TASA 7.164e+01 1.894e+00 37.814
## as.factor(QUITO)SI 5.003e-01 3.863e-02 12.951
## NUMERORENOVACIONES 1.631e-01 1.353e-02 12.051
## as.factor(BANDAMADURACION)DE 1 A 30 DIAS 1.164e+00 2.030e-01 5.732
## as.factor(BANDAMADURACION)DE 181 A 360 DIAS 8.564e-01 1.979e-01 4.327
## as.factor(BANDAMADURACION)DE 31 A 90 DIAS 1.073e+00 2.004e-01 5.357
## as.factor(BANDAMADURACION)DE 91 A 180 DIAS 9.292e-01 1.991e-01 4.668
## as.factor(PIGNORADO)SI -1.309e+00 8.330e-02 -15.719
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## CAPITAL 1.10e-09 ***
## PLAZO < 2e-16 ***
## TASA < 2e-16 ***
## as.factor(QUITO)SI < 2e-16 ***
## NUMERORENOVACIONES < 2e-16 ***
## as.factor(BANDAMADURACION)DE 1 A 30 DIAS 9.93e-09 ***
## as.factor(BANDAMADURACION)DE 181 A 360 DIAS 1.51e-05 ***
## as.factor(BANDAMADURACION)DE 31 A 90 DIAS 8.45e-08 ***
## as.factor(BANDAMADURACION)DE 91 A 180 DIAS 3.04e-06 ***
## as.factor(PIGNORADO)SI < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20372 on 21013 degrees of freedom
## Residual deviance: 17852 on 21003 degrees of freedom
## AIC: 17874
##
## Number of Fisher Scoring iterations: 5
predicentrenamiento <- predict(modelogit, newdata = entrenamiento, type = "response")
predicentrenamiento2 <- ifelse (predicentrenamiento > 0.5, 1, 0)
library (caret)
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
confusionMatrix(as.factor(entrenamiento$RENOVADO), as.factor(predicentrenamiento2))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 615 2562
## 1 352 13282
##
## Accuracy : 0.8267
## 95% CI : (0.8209, 0.8324)
## No Information Rate : 0.9425
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2288
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.63599
## Specificity : 0.83830
## Pos Pred Value : 0.19358
## Neg Pred Value : 0.97418
## Prevalence : 0.05752
## Detection Rate : 0.03658
## Detection Prevalence : 0.18898
## Balanced Accuracy : 0.73714
##
## 'Positive' Class : 0
##
prec <- posPredValue(as.factor(entrenamiento$RENOVADO), as.factor(predicentrenamiento2), positive = "1")
recall <- sensitivity(as.factor(entrenamiento$RENOVADO), as.factor(predicentrenamiento2), positive = "1")
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.4.3
##
## Adjuntando el paquete: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
F1_Score(y_pred=as.factor(entrenamiento$RENOVADO), y_true=as.factor(predicentrenamiento2), positive="1")
## [1] 0.9011466
devianza <- deviance(modelogit)
devianza
## [1] 17851.6
modelo_nulo <-glm (RENOVADO ~ 1, data = bddv, family ="binomial")
devianza <- deviance(modelo_nulo)
devianza
## [1] 20372.02