library(ggplot2)
ggplot(data,
aes(x=Recency))+
geom_histogram(fill="lightblue",
bins=nclass.Sturges(data$Recency))
ggplot(data,
aes(x=Frequency))+
geom_histogram(fill="lightblue",
bins=nclass.Sturges(data$Frequency))
ggplot((data),
aes(x=Monetary))+
geom_histogram(fill="lightblue",
bins=nclass.Sturges(data$Monetary))
Como se observó en los histogramas anteriores, los datos están lejos de ser considerados normales, por lo que se hará una transformacion logrÃtmica que permita segregar mejor los datos.
Para una correcta segmentación por K Medias, las tres variables deben coincidir en el orden de la escala de medicion. Las variables R y F estan medidas en dÃas, por lo que un valor más pequeño denota a los clientes con un mejor comportamiento. Mientras que la variable M asigna mayores valores a los cleitns que han comprado más. Para que el algoritmo de agrupamiento sea óptimo, se debe hacer una transformación para que esta escala sea inversa.
data%>%
mutate(LogMonetary=log(Monetary))%>%
mutate(LogFrquency=log(Frequency))%>%
mutate(LogRecency=log(Recency))->data2
(data2[data2$LogFrquency!=Inf &
data2$LogFrquency!=-Inf &
data2$LogMonetary!=Inf &
data2$LogMonetary!=-Inf &
data2$LogRecency!=Inf &
data2$LogRecency!=-Inf &
!is.na(data2$LogMonetary) &
!is.na(data2$LogFrquency) &
!is.na(data2$LogRecency),])->data2
data2%>%
mutate(TrnsfLogMonetary=(max(data2$LogMonetary)-LogMonetary)/(max(LogMonetary)-min(LogMonetary)))%>%
mutate(TrnsfLogFrquency=abs(min(data2$LogFrquency)-LogFrquency)/(max(LogFrquency)-min(LogFrquency)))%>%
mutate(TrnsfLogRecency=abs(min(data2$LogRecency)-LogRecency)/(max(LogRecency)-min(LogRecency)))->data2
data2%>%
head()
## CardCode TotalTrans TotalSpent UltimaFechaCompra FechaCreacionProspecto
## 1 C000041 14 253611.87 2022-02-18 2014-11-17
## 2 C000073 2 20163.74 2022-03-01 2014-11-17
## 3 C000074 12 288808.10 2022-07-05 2014-11-17
## 4 C000185 22 106097.37 2022-06-06 2014-11-17
## 5 C000215 10 24152.52 2022-03-16 2014-11-17
## 6 C000477 9 10236.67 2022-03-08 2014-11-17
## FechaLimite FechaReferencia FechaInicial
## 1 2021-01-20 20:48:46 2022-07-14 20:48:46 2021-01-20 20:48:46
## 2 2021-01-20 20:48:46 2022-07-14 20:48:46 2021-01-20 20:48:46
## 3 2021-01-20 20:48:46 2022-07-14 20:48:46 2021-01-20 20:48:46
## 4 2021-01-20 20:48:46 2022-07-14 20:48:46 2021-01-20 20:48:46
## 5 2021-01-20 20:48:46 2022-07-14 20:48:46 2021-01-20 20:48:46
## 6 2021-01-20 20:48:46 2022-07-14 20:48:46 2021-01-20 20:48:46
## CantidadCategorias MetodoEnvio ComprasPosteriores Monetary Recency Frequency
## 1 Multicategoria Multiples 3169.50 18115.134 146 30
## 2 Multicategoria Multiples NA 10081.870 135 405
## 3 Multicategoria Multiples 96575.34 24067.342 9 48
## 4 Multicategoria Multiples 2345.00 4822.608 38 23
## 5 Multicategoria Multiples 3298.11 2415.252 120 46
## 6 Multicategoria Multiples NA 1137.408 128 51
## VarObjetivo AntiguedadCotizacion Sucursal Vendedor
## 1 Retenido 160 MAYOREO Cristina Villalta
## 2 Perdido 321 ROOSEVELT Gary Gonzalez
## 3 Retenido 2 ZONA 10 Lucia Fernanda Molina Herrera
## 4 Retenido 48 ROOSEVELT Milton Lopez
## 5 Retenido 77 VENTAS CEDI Beatriz Toledo
## 6 Perdido 178 VENTAS CEDI Beatriz Toledo
## TerminoPago LogMonetary LogFrquency LogRecency TrnsfLogMonetary
## 1 Contado 9.804503 3.401197 4.983607 0.2624100
## 2 Contado 9.218494 6.003887 4.905275 0.3264101
## 3 Contado 10.088611 3.871201 2.197225 0.2313816
## 4 Contado 8.481070 3.135494 3.637586 0.4069468
## 5 Contado 7.789559 3.828641 4.787492 0.4824691
## 6 Contado 7.036507 3.931826 4.852030 0.5647126
## TrnsfLogFrquency TrnsfLogRecency
## 1 0.5407554 0.7928107
## 2 0.9545562 0.7803494
## 3 0.6154811 0.3495427
## 4 0.4985113 0.5786808
## 5 0.6087146 0.7616120
## 6 0.6251198 0.7718791
ggplot(data2,aes(x=TrnsfLogMonetary))+
geom_histogram(bins = nclass.Sturges(data2$TrnsfLogMonetary),
fill="lightblue")
ggplot(data2,aes(x=TrnsfLogFrquency))+
geom_histogram(bins = nclass.Sturges(data2$TrnsfLogFrquency),
fill="lightblue")
ggplot(data2,aes(x=TrnsfLogRecency))+
geom_histogram(bins = nclass.Sturges(data2$TrnsfLogRecency),
fill="lightblue")
ks.test(data2$Monetary,
pnorm,
mean(data2$Monetary),
sd(data2$Monetary))$p.value->PVal_M
ks.test(data2$LogMonetary,
pnorm,
mean(data2$LogMonetary),
sd(data2$LogMonetary))$p.value->PVal_logM
ks.test(data2$TrnsfLogMonetary,
pnorm,
mean(data2$TrnsfLogMonetary),
sd(data2$TrnsfLogMonetary))$p.value->PVal_TlogM
ks.test(data2$Recency,
pnorm,
mean(data2$Recency),
sd(data2$Recency))$p.value->PVal_R
ks.test(data2$LogRecency,
pnorm,
mean(data2$LogRecency),
sd(data2$LogRecency))$p.value->PVal_logR
ks.test(data2$TrnsfLogRecency,
pnorm,
mean(data2$TrnsfLogRecency),
sd(data2$TrnsfLogRecency))$p.value->PVal_TlogR
ks.test(data2$Frequency,
pnorm,
mean(data2$Frequency),
sd(data2$Frequency))$p.value->PVal_F
ks.test(data2$LogFrquency,
pnorm,
mean(data2$LogFrquency),
sd(data2$LogFrquency))$p.value->PVal_logF
ks.test(data2$TrnsfLogFrquency,
pnorm,
mean(data2$TrnsfLogFrquency),
sd(data2$TrnsfLogFrquency))$p.value->PVal_TlogF
data.frame(Variable=c("Monetary",
"Recency",
"Frequency"),
Regular=c(PVal_M,
PVal_R,
PVal_F),
Logaritmico=c(PVal_logM,
PVal_logR,
PVal_logF),
Transformado_Log=c(PVal_TlogM,
PVal_TlogR,
PVal_TlogF))->Normality
Normality
## Variable Regular Logaritmico Transformado_Log
## 1 Monetary 0 0.001344523 0.001344523
## 2 Recency 0 0.000000000 0.000000000
## 3 Frequency 0 0.193266917 0.193266917
La transformación de las variables M y F sà resultó en datos que siguen el comportamiento normal. No asà el caso de la variable R, por lo que se hará un análisis de correlación usando pruebas no paramétricas.
R_MF<-cor.test(data2$TrnsfLogMonetary,
data2$TrnsfLogFrquency,
alternative = "two.sided",method = "spearman")
## Warning in cor.test.default(data2$TrnsfLogMonetary, data2$TrnsfLogFrquency, :
## Cannot compute exact p-value with ties
ggplot(data2,aes(x=TrnsfLogMonetary,y=TrnsfLogFrquency))+
geom_point()
R_RF<-cor.test(data2$TrnsfLogRecency,
data2$TrnsfLogFrquency,
alternative = "two.sided",method = "spearman")
## Warning in cor.test.default(data2$TrnsfLogRecency, data2$TrnsfLogFrquency, :
## Cannot compute exact p-value with ties
ggplot(data2,aes(x=TrnsfLogRecency,y=TrnsfLogFrquency))+
geom_point()
R_RM<-cor.test(data2$TrnsfLogRecency,
data2$TrnsfLogMonetary,
alternative = "two.sided",method = "spearman")
## Warning in cor.test.default(data2$TrnsfLogRecency, data2$TrnsfLogMonetary, :
## Cannot compute exact p-value with ties
ggplot(data2,aes(x=TrnsfLogRecency,y=TrnsfLogMonetary))+
geom_point()
data.frame(R_Spearman=c("R","F","M"),
R=round(c(NA,R_RF$estimate,R_RM$estimate),4),
F=round(c(R_RF$estimate,NA,R_MF$estimate),4),
M=round(c(R_RM$estimate,R_MF$estimate,NA),4))
## R_Spearman R F M
## 1 R NA 0.0154 0.2204
## 2 F 0.0154 NA 0.0567
## 3 M 0.2204 0.0567 NA
Con la hipotesis nula que indica que rho es cero, se pueden observar los siguientes p valores.
data.frame(PValues=c("R","F","M"),
R=round(c(NA,R_RF$p.value,R_RM$p.value),4),
F=round(c(R_RF$p.value,NA,R_MF$p.value),4),
M=round(c(R_RM$p.value,R_MF$p.value,NA),4))
## PValues R F M
## 1 R NA 0.4374 0.0000
## 2 F 0.4374 NA 0.0042
## 3 M 0.0000 0.0042 NA
El valor P de la prueba de correlacion entre R y F si es significativo, pero no se puede considerar suficientemente alto para establecerlo como una variable explicativa.
(data.frame(CardCode=data2$CardCode,
Frequency=data2$Frequency,
Monetary=data2$Monetary,
Recency=data2$Recency))->data3
data3[data3$Frequency!=-Inf &
data3$Frequency!= Inf &
data3$Monetary!=-Inf &
data3$Monetary!= Inf &
data3$Recency!=-Inf &
data3$Recency!= Inf,]->data3
(data.frame(CardCode=data2$CardCode,
LogFrequency=data2$TrnsfLogFrquency,
LogMonetary=data2$TrnsfLogMonetary,
LogRecency=data2$TrnsfLogRecency))->data3B
data3B[data3B$LogFrequency!=-Inf &
data3B$LogFrequency!= Inf &
data3B$LogMonetary!=-Inf &
data3B$LogMonetary!= Inf &
data3B$LogRecency!=-Inf &
data3B$LogRecency!= Inf,]->data3B
S<-NULL
S2<-NULL
SSEs<-NULL
K<-8
for (i in 1:K) {
kmeans(data3[,2:4],
centers = i,
nstart = 1,
iter.max = 300)->clusters
base<-clusters$tot.withinss/clusters$totss
S<-c(S,base)
kmeans(data3B[,2:4],
centers = i,
nstart = 1,
iter.max = 300)->clusters2
base2<-clusters2$tot.withinss/clusters2$totss
S2<-c(S2,base2)
}
SSEs<-rbind(
data.frame(K=1:K,
SSE=S,
Metodo=rep("Regular",K)),
data.frame(K=1:K,
SSE=S2,
Metodo=rep("Logaritmicos",K)))
K=3
SSEs%>%
ggplot(aes(x=K,y=SSE))+
geom_line(aes(colour=Metodo))+
geom_point(aes(colour=Metodo))+
theme_classic()+
geom_vline(xintercept = K,
linetype="dashed",
colour="blue")
kmeans(as.matrix(data3B[,2:4]),centers = K,iter.max = 1000)->KMedias
data3B%>%
mutate(Segmento=KMedias$cluster)->data3B
cols<-c("blue","green","red","orange","purple")
data3B$Colours=cols[data3B$Segmento]
plot3d(x=data3B$LogFrequency,
y=data3B$LogMonetary,
z=data3B$LogRecency,
col = data3B$Colours,
xlab="LogFrequency",
ylab="LogMonetary",
zlab="LogRecency")->PlotRFM
PlotRFM
data.frame(CardCode=data2$CardCode,
Antiguedad=data2$AntiguedadCotizacion)->data3C
data3C[data3C$Antiguedad!=Inf &
data3C$Antiguedad!=-Inf ,]->data3C
S_C<-NULL
S2_C<-NULL
SSEs_C<-NULL
K_C<-7
base2_C<-NULL
for (i in 1:K_C) {
kmeans(data3C[,2],
centers = i,
nstart = 1,
iter.max = 300)->clusters
base2_C<-clusters$tot.withinss/clusters$totss
S_C<-c(S_C,base2_C)
}
S_C
## [1] 1.0000000 0.4476204 0.2392165 0.1848128 0.1620633 0.1534399 0.1475420
SSEs_C<-data.frame(K=1:K_C,
SSEs_C=S_C,
Metodo="Regular")
SSEs_C%>%
ggplot((aes(x=K,y=SSEs_C)))+
geom_line()+
geom_point()
Tambien se hará con K=6
K=6
kmeans(data3C[,2],
centers =K,
nstart = 1,
iter.max = 300)->CotiCluster
CotiCluster$centers
## [,1]
## 1 907.89744
## 2 296.09060
## 3 21.21145
## 4 175.87121
## 5 455.16552
## 6 87.99497
data.frame(cbind(CotiCluster$cluster,data3C$CardCode))->segmentosCoti
segmentosCoti%>%
mutate(SegmentoCoti=ifelse(X1=="1","B1",
ifelse(X1=="2","A1",
ifelse(X1=="3","C1",
ifelse(X1=="4","A2",
ifelse(X1=="5","B2",
"C2"))))))->segmentosCoti
merge(x = data2,
y = data3B[,c("CardCode","Segmento")],
by="CardCode")->data4
data4%>%
mutate(RankLogMonetary=percent_rank(TrnsfLogMonetary))%>%
mutate(M_Percentile=ifelse(RankLogMonetary>0.4,"M3",ifelse(RankLogMonetary>0.2,"M2","M1")))%>%
mutate(RankLogRecency=percent_rank(TrnsfLogRecency))%>%
mutate(R_Percentile=ifelse(RankLogRecency>0.4,"R3",ifelse(RankLogRecency>0.2,"R2","R1")))%>%
mutate(RankLogFrequency=percent_rank(TrnsfLogFrquency))%>%
mutate(F_Percentile=ifelse(RankLogFrequency>0.4,"F3",ifelse(RankLogFrequency>0.2,"F2","F1")))%>%
mutate(RM=paste(R_Percentile,M_Percentile))%>%
mutate(RF=paste(R_Percentile,F_Percentile))%>%
mutate(FM=paste(F_Percentile,M_Percentile))->data4
merge(x=data4,
y=segmentosCoti[,c("X2","SegmentoCoti")],
by.x ="CardCode",by.y = "X2")->data4
chisq.test(x=data4$Segmento,
y=data4$VarObjetivo)->Chisq_Segment
Chisq_Segment$observed
## data4$VarObjetivo
## data4$Segmento Perdido Retenido
## 1 148 467
## 2 573 316
## 3 654 387
Chisq_Segment$expected
## data4$VarObjetivo
## data4$Segmento Perdido Retenido
## 1 332.2692 282.7308
## 2 480.3045 408.6955
## 3 562.4263 478.5737
Chisq_Segment$p.value
## [1] 1.730067e-64
chisq.test(x=data4$CantidadCategorias,
y=data4$VarObjetivo)->Chisq_Categorias
Chisq_Categorias$observed
## data4$VarObjetivo
## data4$CantidadCategorias Perdido Retenido
## Multicategoria 1279 1141
## UniCategoria 96 29
Chisq_Categorias$expected
## data4$VarObjetivo
## data4$CantidadCategorias Perdido Retenido
## Multicategoria 1307.46562 1112.53438
## UniCategoria 67.53438 57.46562
Chisq_Categorias$p.value
## [1] 2.647693e-07
chisq.test(x=data4$MetodoEnvio,
y=data4$VarObjetivo)->Chisq_Envio
Chisq_Envio
##
## Pearson's Chi-squared test
##
## data: data4$MetodoEnvio and data4$VarObjetivo
## X-squared = 62.693, df = 2, p-value = 2.434e-14
chisq.test(x=data4$TerminoPago,
y=data4$VarObjetivo)->Chisq_Termino
## Warning in chisq.test(x = data4$TerminoPago, y = data4$VarObjetivo): Chi-squared
## approximation may be incorrect
Chisq_Termino
##
## Pearson's Chi-squared test
##
## data: data4$TerminoPago and data4$VarObjetivo
## X-squared = 12.623, df = 3, p-value = 0.005528
chisq.test(x=data4$R_Percentile,y=data4$VarObjetivo)->Chisq_R
Chisq_R
##
## Pearson's Chi-squared test
##
## data: data4$R_Percentile and data4$VarObjetivo
## X-squared = 352.84, df = 2, p-value < 2.2e-16
chisq.test(x=data4$M_Percentile,y=data4$VarObjetivo)->Chisq_M
Chisq_M
##
## Pearson's Chi-squared test
##
## data: data4$M_Percentile and data4$VarObjetivo
## X-squared = 42.407, df = 2, p-value = 6.185e-10
chisq.test(x=data4$F_Percentile,y=data4$VarObjetivo)->Chisq_F
Chisq_F
##
## Pearson's Chi-squared test
##
## data: data4$F_Percentile and data4$VarObjetivo
## X-squared = 7.4798, df = 2, p-value = 0.02376
chisq.test(x=data4$RM,y=data4$VarObjetivo)->Chisq_RM
Chisq_RM
##
## Pearson's Chi-squared test
##
## data: data4$RM and data4$VarObjetivo
## X-squared = 373.87, df = 8, p-value < 2.2e-16
chisq.test(x=data4$RF,y=data4$VarObjetivo)->Chisq_RF
Chisq_RF
##
## Pearson's Chi-squared test
##
## data: data4$RF and data4$VarObjetivo
## X-squared = 374.4, df = 8, p-value < 2.2e-16
chisq.test(x=data4$FM,y=data4$VarObjetivo)->Chisq_FM
Chisq_FM
##
## Pearson's Chi-squared test
##
## data: data4$FM and data4$VarObjetivo
## X-squared = 53.587, df = 8, p-value = 8.3e-09
chisq.test(x=data4$SegmentoCoti,y=data4$VarObjetivo)->Chisq_AntCoti
Chisq_AntCoti
##
## Pearson's Chi-squared test
##
## data: data4$SegmentoCoti and data4$VarObjetivo
## X-squared = 351.36, df = 5, p-value < 2.2e-16
chisq.test(x=data4$Sucursal,y=data4$VarObjetivo)->Chisq_Sucursal
## Warning in chisq.test(x = data4$Sucursal, y = data4$VarObjetivo): Chi-squared
## approximation may be incorrect
Chisq_Sucursal
##
## Pearson's Chi-squared test
##
## data: data4$Sucursal and data4$VarObjetivo
## X-squared = 72.885, df = 32, p-value = 5e-05
chisq.test(x=data4$Vendedor,y=data4$VarObjetivo)->Chisq_Vendedor
## Warning in chisq.test(x = data4$Vendedor, y = data4$VarObjetivo): Chi-squared
## approximation may be incorrect
Chisq_Vendedor
##
## Pearson's Chi-squared test
##
## data: data4$Vendedor and data4$VarObjetivo
## X-squared = 301.79, df = 190, p-value = 4.351e-07
data.frame(Variable=c("Metodo de Envio","Cluster","Condicion de Pago","Antiguedad Ultima Coti","Sucursal","Vendedor","R Percentil","F Percentil","M Percentil","RF Percentil","RM Percentil","FM Percentil"),PValor=round(c(Chisq_Envio$p.value,
Chisq_Segment$p.value,
Chisq_Termino$p.value,
Chisq_AntCoti$p.value,
Chisq_Sucursal$p.value,
Chisq_Vendedor$p.value,
Chisq_R$p.value,
Chisq_F$p.value,
Chisq_M$p.value,
Chisq_RF$p.value,
Chisq_RM$p.value,
Chisq_FM$p.value),6))->Pvalues
Pvalues%>%
mutate(Conclusion=ifelse(PValor>=0.05,"Aceptar Ho, las variables son Independientes","Aceptar Ha, las variables tienen asociacion"))
## Variable PValor Conclusion
## 1 Metodo de Envio 0.000000 Aceptar Ha, las variables tienen asociacion
## 2 Cluster 0.000000 Aceptar Ha, las variables tienen asociacion
## 3 Condicion de Pago 0.005528 Aceptar Ha, las variables tienen asociacion
## 4 Antiguedad Ultima Coti 0.000000 Aceptar Ha, las variables tienen asociacion
## 5 Sucursal 0.000050 Aceptar Ha, las variables tienen asociacion
## 6 Vendedor 0.000000 Aceptar Ha, las variables tienen asociacion
## 7 R Percentil 0.000000 Aceptar Ha, las variables tienen asociacion
## 8 F Percentil 0.023757 Aceptar Ha, las variables tienen asociacion
## 9 M Percentil 0.000000 Aceptar Ha, las variables tienen asociacion
## 10 RF Percentil 0.000000 Aceptar Ha, las variables tienen asociacion
## 11 RM Percentil 0.000000 Aceptar Ha, las variables tienen asociacion
## 12 FM Percentil 0.000000 Aceptar Ha, las variables tienen asociacion
data4%>%
mutate(VarObjetivoNum=ifelse(VarObjetivo=="Retenido",1,0))%>%
mutate(Canal=ifelse(Sucursal=="PROYECTOS","Proyectos",ifelse(Sucursal=="CC","Digital","Retail")))->data4
trigger=0.5
glm(VarObjetivoNum~ Segmento
## MetodoEnvio
## CantidadCategorias+
## Sucursal+TerminoPago+
## + SegmentoCoti
## R_Percentile+
## M_Percentile,
,data=data4,family = "binomial")->Model1
as.data.frame(Model1$coefficients)
## Model1$coefficients
## (Intercept) 1.4070549
## Segmento -0.7276706
predict(Model1,type = "response")->Predichos1
cbind(data4,Predichos1)->ValData1
ValData1%>%
mutate(Predicho=ifelse(Predichos1>trigger,"Retencion","Perdida"))->ValData1
table(ValData1$Predicho,ValData1$VarObjetivo)->conf.matrix1
conf.matrix1
##
## Perdido Retenido
## Perdida 1227 703
## Retencion 148 467
conf.matrix1%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto","Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe1
conf.dframe1
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1694 0.666
## 2 Incorrecto 851 0.334
Se calcula la exactitud del modelo
conf.dframe1$Prcnt[1]->Exactitud1
Exactitud1
## [1] 0.6656189
glm(VarObjetivoNum~ Segmento+
MetodoEnvio
## CantidadCategorias+
## Sucursal+TerminoPago+
## SegmentoCoti+
## R_Percentile+
## M_Percentile,
,data=data4,family = "binomial")->Model2
predict(Model2,type = "response")->Predichos2
cbind(data4,Predichos2)%>%
mutate(Predicho=ifelse(Predichos2>trigger,"Retencion","Perdida"))->ValData2
table(ValData2$Predicho,ValData2$VarObjetivo)->conf.matrix2
conf.matrix2
##
## Perdido Retenido
## Perdida 843 450
## Retencion 532 720
conf.matrix2%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto","Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe2
conf.dframe2
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1563 0.614
## 2 Incorrecto 982 0.386
Se calcula la exactitud del modelo
conf.dframe2$Prcnt[1]->Exactitud2
Exactitud2
## [1] 0.6141454
glm(VarObjetivoNum~ Segmento+
MetodoEnvio+
CantidadCategorias
## Sucursal+TerminoPago+
## SegmentoCoti+
## R_Percentile+
## M_Percentile,
,data=data4,family = "binomial")->Model3
predict(Model3,type = "response")->Predichos3
cbind(data4,Predichos3)%>%
mutate(Predicho=ifelse(Predichos3>trigger,"Retencion","Perdida"))->ValData3
table(ValData3$Predicho,ValData3$VarObjetivo)->conf.matrix3
conf.matrix3
##
## Perdido Retenido
## Perdida 862 461
## Retencion 513 709
conf.matrix3%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto","Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe3
conf.dframe3
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1571 0.617
## 2 Incorrecto 974 0.383
Se calcula la exactitud del modelo
conf.dframe3$Prcnt[1]->Exactitud3
Exactitud3
## [1] 0.6172888
glm(VarObjetivoNum~ Segmento+
MetodoEnvio+
CantidadCategorias
+ Sucursal
##+TerminoPago+
## SegmentoCoti+
## R_Percentile+
## M_Percentile,
,data=data4,family = "binomial")->Model4
predict(Model4,type = "response")->Predichos4
cbind(data4,Predichos4)%>%
mutate(Predicho=ifelse(Predichos4>trigger,"Retencion","Perdida"))->ValData4
table(ValData4$Predicho,ValData4$VarObjetivo)->conf.matrix4
conf.matrix4
##
## Perdido Retenido
## Perdida 986 501
## Retencion 389 669
conf.matrix4%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto","Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe4
conf.dframe4
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1655 0.650
## 2 Incorrecto 890 0.350
Se calcula la exactitud del modelo
conf.dframe4$Prcnt[1]->Exactitud4
Exactitud4
## [1] 0.6502947
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
+Sucursal
+TerminoPago
## +SegmentoCoti
## +R_Percentile
## +M_Percentile,
,data=data4,family = "binomial")->Model5
predict(Model5,type = "response")->Predichos5
cbind(data4,Predichos5)%>%
mutate(Predicho=ifelse(Predichos5>trigger,"Retencion","Perdida"))->ValData5
table(ValData5$Predicho,ValData5$VarObjetivo)->conf.matrix5
conf.matrix5
##
## Perdido Retenido
## Perdida 988 493
## Retencion 387 677
conf.matrix5%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto","Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe5
conf.dframe5
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1665 0.654
## 2 Incorrecto 880 0.346
Se calcula la exactitud del modelo
conf.dframe5$Prcnt[1]->Exactitud5
Exactitud5
## [1] 0.654224
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
+Sucursal
+TerminoPago
+SegmentoCoti
## +R_Percentile
## +M_Percentile,
,data=data4,family = "binomial")->Model6
predict(Model6,type = "response")->Predichos6
cbind(data4,Predichos6)%>%
mutate(Predicho=ifelse(Predichos6>trigger,"Retencion","Perdida"))->ValData6
table(ValData6$Predicho,ValData6$VarObjetivo)->conf.matrix6
conf.matrix6
##
## Perdido Retenido
## Perdida 1005 430
## Retencion 370 740
conf.matrix6%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto","Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe6
conf.dframe6
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1745 0.686
## 2 Incorrecto 800 0.314
Se calcula la exactitud del modelo
conf.dframe6$Prcnt[1]->Exactitud6
Exactitud6
## [1] 0.6856582
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
+Sucursal
+TerminoPago
+SegmentoCoti
+RM
##+M_Percentile
##+R_Percentile
,data=data4,family = "binomial")->Model7
predict(Model7,type = "response")->Predichos7
cbind(data4,Predichos7)%>%
mutate(Predicho=ifelse(Predichos7>trigger,"Retencion","Perdida"))->ValData7
table(ValData7$Predicho,ValData7$VarObjetivo)->conf.matrix7
conf.matrix7
##
## Perdido Retenido
## Perdida 1061 471
## Retencion 314 699
conf.matrix7%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7
conf.dframe7
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1760 0.692
## 2 Incorrecto 785 0.308
Se calcula la exactitud del modelo
conf.dframe7$Prcnt[1]->Exactitud7
Exactitud7
## [1] 0.6915521
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
+Sucursal
+TerminoPago
+SegmentoCoti
+F_Percentile
+R_Percentile
+M_Percentile
,data=data4,family = "binomial")->Model8
predict(Model8,type = "response")->Predichos8
cbind(data4,Predichos8)%>%
mutate(Predicho=ifelse(Predichos8>trigger,"Retencion","Perdida"))->ValData8
table(ValData8$Predicho,ValData8$VarObjetivo)->conf.matrix8
conf.matrix8
##
## Perdido Retenido
## Perdida 1041 456
## Retencion 334 714
conf.matrix8%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe8
conf.dframe8
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1755 0.690
## 2 Incorrecto 790 0.310
Se calcula la exactitud del modelo
conf.dframe8$Prcnt[1]->Exactitud8
Exactitud8
## [1] 0.6895874
glm(VarObjetivoNum ~Segmento
##+MetodoEnvio
## +CantidadCategorias
##+Sucursal
+TerminoPago
+SegmentoCoti
##+F_Percentile
##+R_Percentile
##+M_Percentile
,data=data4,family = "binomial")->Model9
predict(Model9,type = "response")->Predichos9
cbind(data4,Predichos9)%>%
mutate(Predicho=ifelse(Predichos9>trigger,"Retencion","Perdida"))->ValData9
table(ValData9$Predicho,ValData9$VarObjetivo)->conf.matrix9
conf.matrix9
##
## Perdido Retenido
## Perdida 1010 454
## Retencion 365 716
conf.matrix9%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe9
conf.dframe9
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1726 0.678
## 2 Incorrecto 819 0.322
Se calcula la exactitud del modelo
conf.dframe9$Prcnt[1]->Exactitud9
Exactitud9
## [1] 0.6781925
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
+Canal
##+Sucursal
+TerminoPago
+SegmentoCoti
+RM
##+M_Percentile
##+R_Percentile
,data=data4,family = "binomial")->Model10
predict(Model10,type = "response")->Predichos10
cbind(data4,Predichos10)%>%
mutate(Predicho=ifelse(Predichos10>trigger,"Retencion","Perdida"))->ValData10
table(ValData10$Predicho,ValData10$VarObjetivo)->conf.matrix10
conf.matrix10
##
## Perdido Retenido
## Perdida 1021 441
## Retencion 354 729
conf.matrix10%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe10
conf.dframe10
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1750 0.688
## 2 Incorrecto 795 0.312
Se calcula la exactitud del modelo
conf.dframe10$Prcnt[1]->Exactitud10
Exactitud10
## [1] 0.6876228
data.frame(Modelo=c("Modelo1",
"Modelo2",
"Modelo3",
"Modelo4",
"Modelo5",
"Modelo6",
"Modelo7",
"Modelo8",
"Modelo9",
"Modelo10"),
Exactitud_Prediccion=c(Exactitud1,
Exactitud2,
Exactitud3,
Exactitud4,
Exactitud5,
Exactitud6,
Exactitud7,
Exactitud8,
Exactitud9,
Exactitud10))
## Modelo Exactitud_Prediccion
## 1 Modelo1 0.6656189
## 2 Modelo2 0.6141454
## 3 Modelo3 0.6172888
## 4 Modelo4 0.6502947
## 5 Modelo5 0.6542240
## 6 Modelo6 0.6856582
## 7 Modelo7 0.6915521
## 8 Modelo8 0.6895874
## 9 Modelo9 0.6781925
## 10 Modelo10 0.6876228
Por el modelo de negocios, el riesgo de este tipo de predicciones es predecir la retención de un cliente y que en realidad se pierda, por lo que se calculará también el riesgo de prediccion.
conf.matrix1%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df1
risk.df1[risk.df1$Resultado=="Riesgo",]->Risk1
Risk1$Prcnt->Risk1
ifelse(length(Risk1)==0,0,Risk1)->Risk1
Risk1
## [1] 0.05815324
conf.matrix2%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df2
risk.df2[risk.df2$Resultado=="Riesgo",]->Risk2
Risk2$Prcnt->Risk2
ifelse(length(Risk2)==0,0,Risk2)->Risk2
Risk2
## [1] 0.2090373
conf.matrix3%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df3
risk.df3[risk.df3$Resultado=="Riesgo",]->Risk3
Risk3$Prcnt->Risk3
ifelse(length(Risk3)==0,0,Risk3)->Risk3
Risk3
## [1] 0.2015717
conf.matrix4%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df4
risk.df4[risk.df4$Resultado=="Riesgo",]->Risk4
Risk4$Prcnt->Risk4
ifelse(length(Risk4)==0,0,Risk4)->Risk4
Risk4
## [1] 0.1528487
conf.matrix5%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df5
risk.df5[risk.df5$Resultado=="Riesgo",]->Risk5
Risk5$Prcnt->Risk5
ifelse(length(Risk5)==0,0,Risk5)->Risk5
Risk5
## [1] 0.1520629
conf.matrix6%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df6
risk.df6[risk.df6$Resultado=="Riesgo",]->Risk6
Risk6$Prcnt->Risk6
ifelse(length(Risk6)==0,0,Risk6)->Risk6
Risk6
## [1] 0.1453831
conf.matrix7%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df7
risk.df7[risk.df7$Resultado=="Riesgo",]->Risk7
Risk7$Prcnt->Risk7
ifelse(length(Risk7)==0,0,Risk7)->Risk7
Risk7
## [1] 0.1233792
conf.matrix8%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df8
risk.df8[risk.df8$Resultado=="Riesgo",]->Risk8
Risk8$Prcnt->Risk8
ifelse(length(Risk8)==0,0,Risk8)->Risk8
Risk8
## [1] 0.1312377
conf.matrix9%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df9
risk.df9[risk.df9$Resultado=="Riesgo",]->Risk9
Risk9$Prcnt->Risk9
ifelse(length(Risk9)==0,0,Risk9)->Risk9
Risk9
## [1] 0.1434185
conf.matrix10%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df10
risk.df10[risk.df10$Resultado=="Riesgo",]->Risk10
Risk10$Prcnt->Risk10
ifelse(length(Risk10)==0,0,Risk9)->Risk10
Risk10
## [1] 0.1434185
data.frame(Modelo=c("Modelo1",
"Modelo2",
"Modelo3",
"Modelo4",
"Modelo5",
"Modelo6",
"Modelo7",
"Modelo8",
"Modelo9",
"Modelo10"),
Exactitud_Prediccion=c(Exactitud1,
Exactitud2,
Exactitud3,
Exactitud4,
Exactitud5,
Exactitud6,
Exactitud7,
Exactitud8,
Exactitud9,
Exactitud10),
Riesgo_Prediccion=c(Risk1,
Risk2,
Risk3,
Risk4,
Risk5,
Risk6,
Risk7,
Risk8,
Risk9,
Risk10),
Cant_Variables=c(length(Model1$coefficients),
length(Model2$coefficients),
length(Model3$coefficients),
length(Model4$coefficients),
length(Model5$coefficients),
length(Model6$coefficients),
length(Model7$coefficients),
length(Model8$coefficients),
length(Model9$coefficients),
length(Model10$coefficients)),
AIC=c(Model1$aic,
Model2$aic,
Model3$aic,
Model4$aic,
Model5$aic,
Model6$aic,
Model7$aic,
Model8$aic,
Model9$aic,
Model10$aic
))->ResumenModelos
ResumenModelos
## Modelo Exactitud_Prediccion Riesgo_Prediccion Cant_Variables AIC
## 1 Modelo1 0.6656189 0.05815324 2 3318.883
## 2 Modelo2 0.6141454 0.20903733 4 3287.796
## 3 Modelo3 0.6172888 0.20157171 5 3275.277
## 4 Modelo4 0.6502947 0.15284872 37 3274.522
## 5 Modelo5 0.6542240 0.15206287 40 3276.866
## 6 Modelo6 0.6856582 0.14538310 45 3105.717
## 7 Modelo7 0.6915521 0.12337917 53 3056.798
## 8 Modelo8 0.6895874 0.13123772 51 3060.783
## 9 Modelo9 0.6781925 0.14341847 10 3110.777
## 10 Modelo10 0.6876228 0.14341847 23 3041.615
Se hace un análisis de la exactitud vs la cantidad de variables (transformadas en variables dummy)
ResumenModelos%>%
ggplot(mapping = aes(x=Exactitud_Prediccion,y=AIC))+
geom_point(aes(colour=Modelo))+
geom_label_repel(aes(label = Modelo),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')
# Analisis de Multicolinealidad
Se hará el analisis utilizando únicamente los modelos que tuvieron una exactitud mayor al 67%.
ResumenModelos[ResumenModelos$Exactitud_Prediccion>=0.67,]->BestModels
BestModels%>%
ggplot(mapping = aes(x=Exactitud_Prediccion,y=AIC))+
geom_point(aes(colour=Modelo))+
geom_label_repel(aes(label = Modelo),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')
Se procede a iterar con el modelo 7 y modelo 10 eliminando la variable
con mayor factor de nflacion de la varianza
data.frame(vif(Model7))
## GVIF Df GVIF..1..2.Df..
## Segmento 2.055955 1 1.433860
## MetodoEnvio 1.306133 2 1.069047
## CantidadCategorias 1.067740 1 1.033315
## Sucursal 2.320556 32 1.013240
## TerminoPago 1.314849 3 1.046677
## SegmentoCoti 1.907241 5 1.066696
## RM 3.294985 8 1.077372
Se observa que las variables RM, Sucursal y Segmento tienen una alto factor de inflación de la varianza. Se harán iteraciones con estas variables.
##Analisis de modelo 7
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
+Sucursal
+TerminoPago
+SegmentoCoti
##+RM
##+M_Percentile
##+R_Percentile
,data=data4,family = "binomial")->Model7A
vif(Model7A)
## GVIF Df GVIF^(1/(2*Df))
## Segmento 1.330947 1 1.153667
## MetodoEnvio 1.265765 2 1.060689
## CantidadCategorias 1.063388 1 1.031207
## Sucursal 1.969262 32 1.010645
## TerminoPago 1.313526 3 1.046501
## SegmentoCoti 1.439995 5 1.037137
predict(Model7A,type = "response")->Predichos7A
cbind(data4,Predichos7A)%>%
mutate(Predicho=ifelse(Predichos7A>trigger,"Retencion","Perdida"))->ValData7A
table(ValData7A$Predicho,ValData7A$VarObjetivo)->conf.matrix7A
conf.matrix7A
##
## Perdido Retenido
## Perdida 1005 430
## Retencion 370 740
conf.matrix7A%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7A
conf.dframe7A
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1745 0.686
## 2 Incorrecto 800 0.314
Se calcula la exactitud del modelo
conf.dframe7A$Prcnt[1]->Exactitud7A
Exactitud7A
## [1] 0.6856582
conf.matrix7A%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df7A
risk.df7A[risk.df9$Resultado=="Riesgo",]->Risk7A
Risk7A$Prcnt->Risk7A
ifelse(length(Risk7A)==0,0,Risk9)->Risk7A
Risk7A
## [1] 0.1434185
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
##+Sucursal
+TerminoPago
+SegmentoCoti
+RM
##+M_Percentile
##+R_Percentile
,data=data4,family = "binomial")->Model7B
vif(Model7B)
## GVIF Df GVIF^(1/(2*Df))
## Segmento 1.915237 1 1.383921
## MetodoEnvio 1.110801 2 1.026618
## CantidadCategorias 1.032767 1 1.016251
## TerminoPago 1.044441 3 1.007273
## SegmentoCoti 1.707743 5 1.054975
## RM 2.749176 8 1.065247
predict(Model7B,type = "response")->Predichos7B
cbind(data4,Predichos7B)%>%
mutate(Predicho=ifelse(Predichos7B>trigger,"Retencion","Perdida"))->ValData7B
table(ValData7B$Predicho,ValData7B$VarObjetivo)->conf.matrix7B
conf.matrix7B
##
## Perdido Retenido
## Perdida 1012 436
## Retencion 363 734
conf.matrix7B%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7B
conf.dframe7B
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1746 0.686
## 2 Incorrecto 799 0.314
Se calcula la exactitud del modelo
conf.dframe7B$Prcnt[1]->Exactitud7B
Exactitud7B
## [1] 0.6860511
conf.matrix7B%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df7B
risk.df7B[risk.df9$Resultado=="Riesgo",]->Risk7B
Risk7B$Prcnt->Risk7B
ifelse(length(Risk7B)==0,0,Risk9)->Risk7B
Risk7B
## [1] 0.1434185
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
##+Sucursal
+TerminoPago
+SegmentoCoti
##+RM
##+M_Percentile
##+R_Percentile
,data=data4,family = "binomial")->Model7C
vif(Model7C)
## GVIF Df GVIF^(1/(2*Df))
## Segmento 1.228670 1 1.108454
## MetodoEnvio 1.065284 2 1.015936
## CantidadCategorias 1.030357 1 1.015065
## TerminoPago 1.020115 3 1.003325
## SegmentoCoti 1.251949 5 1.022724
predict(Model7C,type = "response")->Predichos7C
cbind(data4,Predichos7C)%>%
mutate(Predicho=ifelse(Predichos7C>trigger,"Retencion","Perdida"))->ValData7C
table(ValData7C$Predicho,ValData7C$VarObjetivo)->conf.matrix7C
conf.matrix7C
##
## Perdido Retenido
## Perdida 1053 488
## Retencion 322 682
conf.matrix7C%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7C
conf.dframe7C
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1735 0.682
## 2 Incorrecto 810 0.318
Se calcula la exactitud del modelo
conf.dframe7C$Prcnt[1]->Exactitud7C
Exactitud7C
## [1] 0.6817289
conf.matrix7C%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df7C
risk.df7C[risk.df9$Resultado=="Riesgo",]->Risk7C
Risk7C$Prcnt->Risk7C
ifelse(length(Risk7C)==0,0,Risk9)->Risk7C
Risk7C
## [1] 0.1434185
data.frame(vif(Model10))
## GVIF Df GVIF..1..2.Df..
## Segmento 1.933259 1 1.390417
## MetodoEnvio 1.113846 2 1.027321
## CantidadCategorias 1.033804 1 1.016762
## Canal 1.265162 2 1.060563
## TerminoPago 1.252074 3 1.038178
## SegmentoCoti 1.725159 5 1.056046
## RM 2.798651 8 1.066435
La variable RM muestra el factor de inflacion de varianza más alto, por lo que se hará la iteración eliminando esta variación.
glm(VarObjetivoNum ~Segmento
+MetodoEnvio
+CantidadCategorias
##+Sucursal
+TerminoPago
+SegmentoCoti
##+RM
##+M_Percentile
##+R_Percentile
,data=data4,family = "binomial")->Model10A
vif(Model10A)
## GVIF Df GVIF^(1/(2*Df))
## Segmento 1.228670 1 1.108454
## MetodoEnvio 1.065284 2 1.015936
## CantidadCategorias 1.030357 1 1.015065
## TerminoPago 1.020115 3 1.003325
## SegmentoCoti 1.251949 5 1.022724
predict(Model10A,type = "response")->Predichos10A
cbind(data4,Predichos10A)%>%
mutate(Predicho=ifelse(Predichos10A>trigger,"Retencion","Perdida"))->ValData10A
table(ValData10A$Predicho,ValData10A$VarObjetivo)->conf.matrix10A
conf.matrix10A
##
## Perdido Retenido
## Perdida 1053 488
## Retencion 322 682
conf.matrix10A%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe10A
conf.dframe10A
## # A tibble: 2 × 3
## Resultado Freq Prcnt
## <chr> <int> <dbl>
## 1 Correcto 1735 0.682
## 2 Incorrecto 810 0.318
Se calcula la exactitud del modelo
conf.dframe10A$Prcnt[1]->Exactitud10A
Exactitud10A
## [1] 0.6817289
conf.matrix10A%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Retencion" & Var2=="Perdido","Riesgo","Sin Riesgo"))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->risk.df10A
risk.df10A[risk.df9$Resultado=="Riesgo",]->Risk10A
Risk10A$Prcnt->Risk10A
ifelse(length(Risk10A)==0,0,Risk9)->Risk10A
Risk10A
## [1] 0.1434185
data.frame(Modelo=c("Modelo7A",
"Modelo7B",
"Modelo7C",
"Modelo10A"),
Exactitud_Prediccion=c(Exactitud7A,
Exactitud7B,
Exactitud7C,
Exactitud10A),
Riesgo_Prediccion=c(Risk7A,
Risk7B,
Risk7C,
Risk10A),
Cant_Variables=c(length(Model7A$coefficients),
length(Model7B$coefficients),
length(Model7C$coefficients),
length(Model10A$coefficients)),
AIC=c(Model7A$aic,
Model7B$aic,
Model7C$aic,
Model10A$aic))->ResumenModelos7
ResumenModelos7%>%
rbind(BestModels)%>%
ggplot(mapping = aes(x=Exactitud_Prediccion,y=AIC))+
geom_point(aes(colour=Modelo))+
geom_label_repel(aes(label = Modelo),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')
#Ciclo FOR para optimizar el trigger
base<-NULL
base2<-NULL
for (trigger in (1:100)/100) {
## Modelo 1
predict(Model1,type = "response")->Predichos1
cbind(data4,Predichos1)%>%
mutate(Predicho=ifelse(Predichos1>trigger,"Retencion","Perdida"))->ValData1
table(ValData1$Predicho,ValData1$VarObjetivo)->conf.matrix1
conf.matrix1%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe1
conf.dframe1$Prcnt[1]->Exactitud1
## Modelo 2
predict(Model2,type = "response")->Predichos2
cbind(data4,Predichos2)%>%
mutate(Predicho=ifelse(Predichos2>trigger,"Retencion","Perdida"))->ValData2
table(ValData2$Predicho,ValData2$VarObjetivo)->conf.matrix2
conf.matrix2%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe2
conf.dframe2$Prcnt[1]->Exactitud2
## Modelo 3
predict(Model3,type = "response")->Predichos3
cbind(data4,Predichos3)%>%
mutate(Predicho=ifelse(Predichos3>trigger,"Retencion","Perdida"))->ValData3
table(ValData3$Predicho,ValData3$VarObjetivo)->conf.matrix3
conf.matrix3%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe3
conf.dframe3$Prcnt[1]->Exactitud3
## Modelo 4
predict(Model4,type = "response")->Predichos4
cbind(data4,Predichos4)%>%
mutate(Predicho=ifelse(Predichos4>trigger,"Retencion","Perdida"))->ValData4
table(ValData4$Predicho,ValData4$VarObjetivo)->conf.matrix4
conf.matrix4%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe4
conf.dframe4$Prcnt[1]->Exactitud4
## Modelo 5
predict(Model5,type = "response")->Predichos5
cbind(data4,Predichos5)%>%
mutate(Predicho=ifelse(Predichos5>trigger,"Retencion","Perdida"))->ValData5
table(ValData5$Predicho,ValData5$VarObjetivo)->conf.matrix5
conf.matrix5%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe5
conf.dframe5$Prcnt[1]->Exactitud5
## Modelo 6
predict(Model6,type = "response")->Predichos6
cbind(data4,Predichos6)%>%
mutate(Predicho=ifelse(Predichos6>trigger,"Retencion","Perdida"))->ValData6
table(ValData6$Predicho,ValData6$VarObjetivo)->conf.matrix6
conf.matrix6%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe6
conf.dframe6$Prcnt[1]->Exactitud6
## Modelo 7
predict(Model7,type = "response")->Predichos7
cbind(data4,Predichos7)%>%
mutate(Predicho=ifelse(Predichos7>trigger,"Retencion","Perdida"))->ValData7
table(ValData7$Predicho,ValData7$VarObjetivo)->conf.matrix7
conf.matrix7%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7
conf.dframe7$Prcnt[1]->Exactitud7
## Modelo 8
predict(Model8,type = "response")->Predichos8
cbind(data4,Predichos8)%>%
mutate(Predicho=ifelse(Predichos8>trigger,"Retencion","Perdida"))->ValData8
table(ValData8$Predicho,ValData8$VarObjetivo)->conf.matrix8
conf.matrix8%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe8
conf.dframe8$Prcnt[1]->Exactitud8
## Modelo 9
predict(Model9,type = "response")->Predichos9
cbind(data4,Predichos9)%>%
mutate(Predicho=ifelse(Predichos9>trigger,"Retencion","Perdida"))->ValData9
table(ValData9$Predicho,ValData9$VarObjetivo)->conf.matrix9
conf.matrix9%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe9
conf.dframe9$Prcnt[1]->Exactitud9
## Modelo 10
predict(Model10,type = "response")->Predichos10
cbind(data4,Predichos10)%>%
mutate(Predicho=ifelse(Predichos10>trigger,"Retencion","Perdida"))->ValData10
table(ValData10$Predicho,ValData10$VarObjetivo)->conf.matrix10
conf.matrix10%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe10
conf.dframe10$Prcnt[1]->Exactitud10
## Modelo 7A
predict(Model7A,type = "response")->Predichos7A
cbind(data4,Predichos7A)%>%
mutate(Predicho=ifelse(Predichos7A>trigger,"Retencion","Perdida"))->ValData7A
table(ValData7A$Predicho,ValData7A$VarObjetivo)->conf.matrix7A
conf.matrix7A%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7A
conf.dframe7A$Prcnt[1]->Exactitud7A
## Modelo 7B
predict(Model7B,type = "response")->Predichos7B
cbind(data4,Predichos7B)%>%
mutate(Predicho=ifelse(Predichos7B>trigger,"Retencion","Perdida"))->ValData7B
table(ValData7B$Predicho,ValData7B$VarObjetivo)->conf.matrix7B
conf.matrix7B%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7B
conf.dframe7B$Prcnt[1]->Exactitud7B
## Modelo 7C
predict(Model7C,type = "response")->Predichos7C
cbind(data4,Predichos7C)%>%
mutate(Predicho=ifelse(Predichos7C>trigger,"Retencion","Perdida"))->ValData7C
table(ValData7C$Predicho,ValData7C$VarObjetivo)->conf.matrix7C
conf.matrix7C%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe7C
conf.dframe7C$Prcnt[1]->Exactitud7C
## Modelo 10A
predict(Model10A,type = "response")->Predichos10A
cbind(data4,Predichos10A)%>%
mutate(Predicho=ifelse(Predichos10A>trigger,"Retencion","Perdida"))->ValData10A
table(ValData10A$Predicho,ValData10A$VarObjetivo)->conf.matrix10A
conf.matrix10A%>%
data.frame()%>%
mutate(Resultado=ifelse(Var1=="Perdida" & Var2=="Perdido","Correcto",
ifelse(Var1=="Retencion" & Var2=="Retenido","Correcto",
"Incorrecto")))%>%
group_by(Resultado)%>%
summarise(Freq=sum(Freq))%>%
mutate(Prcnt=Freq/sum(Freq))->conf.dframe10A
conf.dframe10A$Prcnt[1]->Exactitud10A
base<-c(trigger,Exactitud1,Exactitud2,Exactitud3,Exactitud4,Exactitud5,Exactitud6,Exactitud7,Exactitud8,Exactitud9,Exactitud10,Exactitud7A,Exactitud7B,Exactitud7C,Exactitud10A)
base2<-rbind(base2,base)
}
data.frame(base2)->Accurancies
colnames(Accurancies)<-c("trigger","Exactitud1","Exactitud2","Exactitud3","Exactitud4","Exactitud5","Exactitud6","Exactitud7","Exactitud8","Exactitud9","Exactitud10","Exactitud7A","Exactitud7B","Exactitud7C","Exactitud10A")
Accurancies%>%
ggplot()+
geom_line(mapping=aes(x=trigger,y=Exactitud1,colour="Modelo1"))+
geom_line(mapping=aes(x=trigger,y=Exactitud2,colour="Modelo2"))+
geom_line(mapping=aes(x=trigger,y=Exactitud3,colour="Modelo3"))+
geom_line(mapping=aes(x=trigger,y=Exactitud4,colour="Modelo4"))+
geom_line(mapping=aes(x=trigger,y=Exactitud5,colour="Modelo5"))+
geom_line(mapping=aes(x=trigger,y=Exactitud6,colour="Modelo6"))+
geom_line(mapping=aes(x=trigger,y=Exactitud7,colour="Modelo7"))+
geom_line(mapping=aes(x=trigger,y=Exactitud8,colour="Modelo8"))+
geom_line(mapping=aes(x=trigger,y=Exactitud9,colour="Modelo9"))+
geom_line(mapping=aes(x=trigger,y=Exactitud10,colour="Modelo10"))+
geom_line(mapping=aes(x=trigger,y=Exactitud7A,colour="Modelo7A"))+
geom_line(mapping=aes(x=trigger,y=Exactitud7B,colour="Modelo7B"))+
geom_line(mapping=aes(x=trigger,y=Exactitud7C,colour="Modelo7C"))+
geom_line(mapping=aes(x=trigger,y=Exactitud10A,colour="Modelo10A"))
## Analizando los mejores modelos obtenidos
Accurancies%>%
mutate(AvgAcurBestModels=(Exactitud7+Exactitud8+Exactitud10+Exactitud7A)/4)->Accurancies
Accurancies[order(-Accurancies$AvgAcurBestModels),]->tops
tops$trigger[1]->BestTrigger
Accurancies%>%
ggplot()+
geom_line(mapping=aes(x=trigger,y=Exactitud7,colour="Modelo7"))+
geom_line(mapping=aes(x=trigger,y=Exactitud8,colour="Modelo8"))+
geom_line(mapping=aes(x=trigger,y=Exactitud10,colour="Modelo10"))+
geom_line(mapping=aes(x=trigger,y=Exactitud7A,colour="Modelo7A"))+
geom_line(mapping=aes(x=trigger,y=AvgAcurBestModels),color='black',linetype="dashed",size=0.5)+
geom_vline(xintercept = BestTrigger,linetype="dashed",size=0.5)+
xlim(0.25,0.75)+
ylim(0.6,0.7)
## Warning: Removed 50 row(s) containing missing values (geom_path).
## Removed 50 row(s) containing missing values (geom_path).
## Removed 50 row(s) containing missing values (geom_path).
## Warning: Removed 51 row(s) containing missing values (geom_path).
## Warning: Removed 50 row(s) containing missing values (geom_path).