library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggplot2)
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:survival':
##
## solder
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(rpart.plot)
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
library(nnet)
#Cargamos la data desde GITHUB
morosidad<-read.csv("https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos/master/Morosidad.csv",sep=";")
#Observar los nombres de los atributos
names(morosidad)
## [1] "NRO_VEC_COB" "ESTADO_PDP" "NRO_CUOTAS"
## [4] "MES" "MES_0" "MES_1"
## [7] "MES_2" "FECHALLAMADA" "HORA"
## [10] "RESULTADOLLAMADA" "DEUDA_TOTAL" "ESTATUS"
## [13] "ACTIVACION" "TIPOCONTACTO" "MORA"
#Observamos los 6 primeros registros de los atributos
head(morosidad)
## NRO_VEC_COB ESTADO_PDP NRO_CUOTAS MES MES_0 MES_1 MES_2
## 1 >10 0 <24, 48] Febrero 244.86 0.00 245.26
## 2 <=10 0 >48 Febrero 610.58 612.69 611.54
## 3 <=10 0 <24, 48] Febrero 2138.73 0.00 0.00
## 4 >10 0 <24, 48] Febrero 323.07 46.85 74.26
## 5 <=10 0 <24, 48] Febrero 920.54 0.00 0.00
## 6 <=10 0 <24, 48] Febrero 364.27 371.11 0.00
## FECHALLAMADA HORA RESULTADOLLAMADA DEUDA_TOTAL ESTATUS ACTIVACION
## 1 3/02/2014 15 PDP Promesa De Pago 3816.34 BT 2012
## 2 18/02/2014 9 PDP Promesa De Pago 13620.50 BT 2012
## 3 5/02/2014 16 TELF. NO CORRESPONDE 18968.50 BT 2013
## 4 24/02/2014 19 MCT Msj Con Terceros 2459.84 BT 2012
## 5 12/02/2014 14 TELF. NO CORRESPONDE 6591.76 BT 2013
## 6 6/02/2014 7 MCF Msj Con Familiar 9836.38 BT 2013
## TIPOCONTACTO MORA
## 1 CEF 1
## 2 CEF 1
## 3 CNE 0
## 4 CNE 0
## 5 CNE 0
## 6 CNE 1
#Revisamos la estructura de los datos
str(morosidad)
## 'data.frame': 8458 obs. of 15 variables:
## $ NRO_VEC_COB : Factor w/ 2 levels "<=10",">10": 2 1 1 2 1 1 1 2 1 2 ...
## $ ESTADO_PDP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NRO_CUOTAS : Factor w/ 3 levels "<=24","<24, 48]",..: 2 3 2 2 2 2 3 3 3 3 ...
## $ MES : Factor w/ 3 levels "Abril","Febrero",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ MES_0 : num 245 611 2139 323 921 ...
## $ MES_1 : num 0 612.7 0 46.9 0 ...
## $ MES_2 : num 245.3 611.5 0 74.3 0 ...
## $ FECHALLAMADA : Factor w/ 61 levels "1/04/2014","10/02/2014",..: 45 19 53 31 8 55 53 8 2 50 ...
## $ HORA : int 15 9 16 19 14 7 16 9 11 12 ...
## $ RESULTADOLLAMADA: Factor w/ 24 levels "CAN Cliente Cancelo",..: 16 16 24 12 24 9 23 9 21 12 ...
## $ DEUDA_TOTAL : num 3816 13620 18969 2460 6592 ...
## $ ESTATUS : Factor w/ 1 level "BT": 1 1 1 1 1 1 1 1 1 1 ...
## $ ACTIVACION : int 2012 2012 2013 2012 2013 2013 2013 2011 2013 2010 ...
## $ TIPOCONTACTO : Factor w/ 2 levels "CEF","CNE": 1 1 2 2 2 2 2 2 1 2 ...
## $ MORA : int 1 1 0 0 0 1 1 1 1 1 ...
morosidad$ESTADO_PDP<-as.factor(morosidad$ESTADO_PDP)
morosidad$FECHALLAMADA<-as.Date(morosidad$FECHALLAMADA, "%d/%M/%Y")
#Eliminando el atributo ESTATUS
morosidad$ESTATUS<-NULL
#Ytulizamos un condicional para especificar el valor del atributo
morosidad$MORA<-ifelse(morosidad$MORA==1,"NO","SI")
#Coercion para mora
morosidad$MORA<-as.factor(morosidad$MORA)
str(morosidad)
## 'data.frame': 8458 obs. of 14 variables:
## $ NRO_VEC_COB : Factor w/ 2 levels "<=10",">10": 2 1 1 2 1 1 1 2 1 2 ...
## $ ESTADO_PDP : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ NRO_CUOTAS : Factor w/ 3 levels "<=24","<24, 48]",..: 2 3 2 2 2 2 3 3 3 3 ...
## $ MES : Factor w/ 3 levels "Abril","Febrero",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ MES_0 : num 245 611 2139 323 921 ...
## $ MES_1 : num 0 612.7 0 46.9 0 ...
## $ MES_2 : num 245.3 611.5 0 74.3 0 ...
## $ FECHALLAMADA : Date, format: "2014-05-03" "2014-05-18" ...
## $ HORA : int 15 9 16 19 14 7 16 9 11 12 ...
## $ RESULTADOLLAMADA: Factor w/ 24 levels "CAN Cliente Cancelo",..: 16 16 24 12 24 9 23 9 21 12 ...
## $ DEUDA_TOTAL : num 3816 13620 18969 2460 6592 ...
## $ ACTIVACION : int 2012 2012 2013 2012 2013 2013 2013 2011 2013 2010 ...
## $ TIPOCONTACTO : Factor w/ 2 levels "CEF","CNE": 1 1 2 2 2 2 2 2 1 2 ...
## $ MORA : Factor w/ 2 levels "NO","SI": 1 1 2 2 2 1 1 1 1 1 ...
#Resumen de los datos
summary(morosidad)
## NRO_VEC_COB ESTADO_PDP NRO_CUOTAS MES MES_0
## <=10:5917 0:8128 <=24 :1077 Abril :2704 Min. : 20.15
## >10 :2541 1: 330 <24, 48]:5192 Febrero:2796 1st Qu.: 239.85
## >48 :2189 Marzo :2958 Median : 422.55
## Mean : 610.17
## 3rd Qu.: 762.36
## Max. :4905.99
##
## MES_1 MES_2 FECHALLAMADA HORA
## Min. : 0.0 Min. : 0.0 Min. :2014-05-01 Min. : 7.00
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:2014-05-06 1st Qu.: 9.00
## Median : 135.3 Median : 0.0 Median :2014-05-11 Median :11.00
## Mean : 382.4 Mean : 266.7 Mean :2014-05-13 Mean :12.15
## 3rd Qu.: 556.4 3rd Qu.: 311.1 3rd Qu.:2014-05-20 3rd Qu.:15.00
## Max. :5909.2 Max. :8791.9 Max. :2014-05-31 Max. :20.00
##
## RESULTADOLLAMADA DEUDA_TOTAL ACTIVACION
## MCT Msj Con Terceros :2678 Min. : 546.1 Min. :2010
## TELF. NO CORRESPONDE :1466 1st Qu.: 3710.6 1st Qu.:2012
## PDP Promesa De Pago :1446 Median : 7317.5 Median :2012
## MCF Msj Con Familiar :1334 Mean :11545.0 Mean :2012
## TAT Tratamiento al Titu: 518 3rd Qu.:14696.8 3rd Qu.:2013
## PAR Promesa Parcial : 203 Max. :79984.0 Max. :2014
## (Other) : 813
## TIPOCONTACTO MORA
## CEF:2484 NO:6440
## CNE:5974 SI:2018
##
##
##
##
##
#Realizamos un resumen con la libreria
describe(morosidad)
## morosidad
##
## 14 Variables 8458 Observations
## ---------------------------------------------------------------------------
## NRO_VEC_COB
## n missing distinct
## 8458 0 2
##
## Value <=10 >10
## Frequency 5917 2541
## Proportion 0.7 0.3
## ---------------------------------------------------------------------------
## ESTADO_PDP
## n missing distinct
## 8458 0 2
##
## Value 0 1
## Frequency 8128 330
## Proportion 0.961 0.039
## ---------------------------------------------------------------------------
## NRO_CUOTAS
## n missing distinct
## 8458 0 3
##
## Value <=24 <24, 48] >48
## Frequency 1077 5192 2189
## Proportion 0.127 0.614 0.259
## ---------------------------------------------------------------------------
## MES
## n missing distinct
## 8458 0 3
##
## Value Abril Febrero Marzo
## Frequency 2704 2796 2958
## Proportion 0.320 0.331 0.350
## ---------------------------------------------------------------------------
## MES_0
## n missing distinct Info Mean Gmd .05 .10
## 8458 0 6382 1 610.2 561.7 73.54 128.46
## .25 .50 .75 .90 .95
## 239.85 422.55 762.36 1330.62 1798.39
##
## lowest : 20.15 21.07 21.18 21.22 21.27
## highest: 4690.78 4749.85 4794.60 4876.55 4905.99
## ---------------------------------------------------------------------------
## MES_1
## n missing distinct Info Mean Gmd .05 .10
## 8458 0 3520 0.917 382.4 536 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 135.3 556.4 1076.8 1557.9
##
## lowest : 0.00 0.01 0.02 0.04 0.05
## highest: 5214.47 5243.34 5507.34 5833.31 5909.18
## ---------------------------------------------------------------------------
## MES_2
## n missing distinct Info Mean Gmd .05 .10
## 8458 0 2423 0.76 266.7 431.4 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 0.0 311.1 847.2 1244.9
##
## lowest : 0.00 0.09 0.10 0.12 0.17
## highest: 6665.19 6972.50 8023.21 8495.14 8791.91
## ---------------------------------------------------------------------------
## FECHALLAMADA
## n missing distinct
## 8458 0 31
##
## lowest : 2014-05-01 2014-05-02 2014-05-03 2014-05-04 2014-05-05
## highest: 2014-05-27 2014-05-28 2014-05-29 2014-05-30 2014-05-31
## ---------------------------------------------------------------------------
## HORA
## n missing distinct Info Mean Gmd .05 .10
## 8458 0 14 0.993 12.15 4.254 7 8
## .25 .50 .75 .90 .95
## 9 11 15 18 19
##
## Value 7 8 9 10 11 12 13 14 15 16
## Frequency 804 870 888 842 862 592 610 588 578 538
## Proportion 0.095 0.103 0.105 0.100 0.102 0.070 0.072 0.070 0.068 0.064
##
## Value 17 18 19 20
## Frequency 334 311 325 316
## Proportion 0.039 0.037 0.038 0.037
## ---------------------------------------------------------------------------
## RESULTADOLLAMADA
## n missing distinct
## 8458 0 24
##
## lowest : CAN Cliente Cancelo CIH Generar Cita CIT Genera Cita Otro Dia IPN Inbound Pdp de Neg IPP Inbound Promesa Par
## highest: REC Cliente con Reclamo RPP Recordar Promesa TAT Tratamiento al Titu TAT sin contacto con Titular TELF. NO CORRESPONDE
## ---------------------------------------------------------------------------
## DEUDA_TOTAL
## n missing distinct Info Mean Gmd .05 .10
## 8458 0 6590 1 11545 11615 1165 1896
## .25 .50 .75 .90 .95
## 3711 7318 14697 24994 36629
##
## lowest : 546.09 546.47 547.50 551.33 553.28
## highest: 79304.00 79489.00 79570.60 79695.90 79984.00
## ---------------------------------------------------------------------------
## ACTIVACION
## n missing distinct Info Mean Gmd
## 8458 0 5 0.865 2012 0.9287
##
## Value 2010 2011 2012 2013 2014
## Frequency 275 1387 2670 3908 218
## Proportion 0.033 0.164 0.316 0.462 0.026
## ---------------------------------------------------------------------------
## TIPOCONTACTO
## n missing distinct
## 8458 0 2
##
## Value CEF CNE
## Frequency 2484 5974
## Proportion 0.294 0.706
## ---------------------------------------------------------------------------
## MORA
## n missing distinct
## 8458 0 2
##
## Value NO SI
## Frequency 6440 2018
## Proportion 0.761 0.239
## ---------------------------------------------------------------------------
table(morosidad$MORA)
##
## NO SI
## 6440 2018
target<-table(morosidad$MORA)
barplot(target)
#Graficamos con la libreria ggplot2
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=MORA))
##Se observa claramente un conjunto de datos desbalanceados, esto afectara al aprendizaje del modelo
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=NRO_VEC_COB, color=MORA))
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=ESTADO_PDP, color=MORA))
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=NRO_CUOTAS, color=MORA))
ggplot(data=morosidad)+
geom_histogram(mapping = aes(x=MES_0, color=MORA))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=morosidad)+
geom_histogram(mapping = aes(x=MES_1, color=MORA))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=morosidad)+
geom_histogram(mapping = aes(x=MES_2, color=MORA))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=morosidad)+
geom_freqpoly(mapping = aes(x=FECHALLAMADA, color=MORA))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=morosidad)+
geom_density(mapping = aes(x=FECHALLAMADA, color=MORA))
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=HORA, color=MORA))
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=RESULTADOLLAMADA, color=MORA))+ coord_flip()
ggplot(data=morosidad)+
geom_histogram(mapping = aes(x=DEUDA_TOTAL, color=MORA))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=ACTIVACION, color=MORA))
ggplot(data=morosidad)+
geom_bar(mapping = aes(x=TIPOCONTACTO, color=MORA))
plot(morosidad[,5:11])
ggplot(data=morosidad)+
geom_point(mapping= aes(x=MES_0, y=MES_1, color=MORA))
ggplot(data=morosidad)+
geom_point(mapping= aes(x=MES_0, y=MES_2, color=MORA))
ggplot(data=morosidad)+
geom_point(mapping= aes(x=MES_1, y=MES_2, color=MORA))
cor(morosidad[,5:7])
## MES_0 MES_1 MES_2
## MES_0 1.0000000 0.4010593 0.1936789
## MES_1 0.4010593 1.0000000 0.1778086
## MES_2 0.1936789 0.1778086 1.0000000
muestra<-sample(5920,2538)
train<-morosidad[-muestra,]
test<-morosidad[muestra,]
dim(train)
## [1] 5920 14
dim(test)
## [1] 2538 14
modelo1<-rpart(MORA~.,data=morosidad, method="class")
modelo1
## n= 8458
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 8458 2018 NO (0.76140932 0.23859068)
## 2) MES_0< 708.885 6136 736 NO (0.88005215 0.11994785)
## 4) MES_0< 394.545 3994 244 NO (0.93890836 0.06109164) *
## 5) MES_0>=394.545 2142 492 NO (0.77030812 0.22969188)
## 10) DEUDA_TOTAL>=5926.765 1478 100 NO (0.93234100 0.06765900) *
## 11) DEUDA_TOTAL< 5926.765 664 272 SI (0.40963855 0.59036145)
## 22) MES_1< 583.805 413 196 NO (0.52542373 0.47457627)
## 44) HORA< 16.5 317 123 NO (0.61198738 0.38801262) *
## 45) HORA>=16.5 96 23 SI (0.23958333 0.76041667) *
## 23) MES_1>=583.805 251 55 SI (0.21912351 0.78087649) *
## 3) MES_0>=708.885 2322 1040 SI (0.44788975 0.55211025)
## 6) DEUDA_TOTAL>=14748.15 1262 425 NO (0.66323296 0.33676704)
## 12) MES_0< 1350.16 639 67 NO (0.89514867 0.10485133) *
## 13) MES_0>=1350.16 623 265 SI (0.42536116 0.57463884)
## 26) DEUDA_TOTAL>=32767.3 289 72 NO (0.75086505 0.24913495)
## 52) MES_0< 3014.38 242 26 NO (0.89256198 0.10743802) *
## 53) MES_0>=3014.38 47 1 SI (0.02127660 0.97872340) *
## 27) DEUDA_TOTAL< 32767.3 334 48 SI (0.14371257 0.85628743) *
## 7) DEUDA_TOTAL< 14748.15 1060 203 SI (0.19150943 0.80849057)
## 14) RESULTADOLLAMADA=MCE Msj Con Empleada,PDP Promesa De Pago 71 18 NO (0.74647887 0.25352113) *
## 15) RESULTADOLLAMADA=CAN Cliente Cancelo,CIH Generar Cita,CIT Genera Cita Otro Dia,IPN Inbound Pdp de Neg,IPP Inbound Promesa Par,MCC Msj Con Comp De Trab,MCF Msj Con Familiar,MCO Msj Con Conyuge,MCS Msj Con Secretaria,MCT Msj Con Terceros,MCV Msj Con Vecino,PAR Promesa Parcial,PPR,REC Cliente con Reclamo,RPP Recordar Promesa,TAT Tratamiento al Titu,TAT sin contacto con Titular,TELF. NO CORRESPONDE 989 150 SI (0.15166835 0.84833165) *
plot(as.party(modelo1))
prp(modelo1, type=1, extra=1)
#Predecimos en la data de prueba.
predicho1<-predict(modelo1, test, type="class")
table(predicho1, test$MORA)
##
## predicho1 NO SI
## NO 1836 172
## SI 74 456
confusionMatrix(predicho1, test$MORA)
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO SI
## NO 1836 172
## SI 74 456
##
## Accuracy : 0.9031
## 95% CI : (0.8909, 0.9143)
## No Information Rate : 0.7526
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7254
##
## Mcnemar's Test P-Value : 6.23e-10
##
## Sensitivity : 0.9613
## Specificity : 0.7261
## Pos Pred Value : 0.9143
## Neg Pred Value : 0.8604
## Prevalence : 0.7526
## Detection Rate : 0.7234
## Detection Prevalence : 0.7912
## Balanced Accuracy : 0.8437
##
## 'Positive' Class : NO
##
modelo2<-nnet(MORA~.,data = train,size=10, maxit=10000, trace=F)
predicho2<-predict(modelo2, test, type="class")
table(predicho2,test$MORA)
##
## predicho2 NO SI
## NO 1779 100
## SI 131 528