des <- read.csv("DatosDesafiliado.csv", header = TRUE, stringsAsFactors = TRUE)
str(des)
## 'data.frame': 4500 obs. of 6 variables:
## $ Plan_internacional : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Minutos_dia : num 203 264 102 229 125 ...
## $ Minutos_internacionales : num 9 7.5 9.4 7.4 10.2 15.2 13.2 8.3 10.8 11.3 ...
## $ Reclamos : int 3 2 3 3 2 2 1 1 3 2 ...
## $ Llamadas_internacionales: int 3 4 6 6 7 5 4 2 4 5 ...
## $ Desafiliado : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
head(des)
## Plan_internacional Minutos_dia Minutos_internacionales Reclamos
## 1 no 202.9 9.0 3
## 2 no 264.5 7.5 2
## 3 no 101.7 9.4 3
## 4 no 229.2 7.4 3
## 5 no 125.0 10.2 2
## 6 no 188.5 15.2 2
## Llamadas_internacionales Desafiliado
## 1 3 no
## 2 4 yes
## 3 6 no
## 4 6 no
## 5 7 no
## 6 5 no
table(des$Desafiliado)
##
## no yes
## 4293 207
prop.table(table(des$Desafiliado))
##
## no yes
## 0.954 0.046
barplot(prop.table(table(des$Desafiliado)),
col = rainbow(2),
ylim = c(0, 1),
main = "Distribucion de Clases")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.4
ggplot(data = des, aes(x = Desafiliado, y = Minutos_dia)) +
geom_boxplot(fill = rainbow(2))
ggplot(data = des, aes(x = Desafiliado, y = Reclamos)) +
geom_boxplot(fill = rainbow(2))
### We can see that customers with more complaints are the ones who churn from the company the most; a quick numeric check follows the heat map below
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
des %>%
count(Desafiliado, Reclamos) %>%
ggplot(mapping = aes(x = Desafiliado, y = Reclamos)) +
geom_tile(mapping = aes(fill = n)) +
labs(x = "Desafiliado", y = "Reclamos") +
scale_fill_continuous(name = "Frequency")
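As a quick numeric check of the observation above (a small addition, output not reproduced here), the average number of complaints can be compared between the two groups:

des %>%
  group_by(Desafiliado) %>%
  summarise(mean_complaints = mean(Reclamos), n = n())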
## . Use a logistic regression model to predict customer churn from their phone usage. Build the predictive model with the data as it stands, without balancing the classes. To do this, also split the data set into one sample for training and another for evaluation (choosing a suitable proportion is up to you).
library(caret)
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
split <- 0.8 # Proportion of data assigned to the training set
trainIndex <- createDataPartition(des$Desafiliado, p = split, list = FALSE)
des_train <- des[trainIndex,]
des_test <- des[-trainIndex,]
# Logistic model (unbalanced data)
des_nbal <- glm(Desafiliado ~., data = des_train, family = binomial(link = "logit"))
summary(des_nbal)
##
## Call:
## glm(formula = Desafiliado ~ ., family = binomial(link = "logit"),
## data = des_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4449 -0.3048 -0.2086 -0.1449 3.2516
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.856797 0.570306 -13.776 < 2e-16 ***
## Plan_internacionalyes 1.811382 0.202432 8.948 < 2e-16 ***
## Minutos_dia 0.016488 0.001794 9.192 < 2e-16 ***
## Minutos_internacionales 0.066348 0.031182 2.128 0.0334 *
## Reclamos 0.432416 0.058170 7.434 1.06e-13 ***
## Llamadas_internacionales -0.034341 0.035702 -0.962 0.3361
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1345.8 on 3600 degrees of freedom
## Residual deviance: 1134.4 on 3595 degrees of freedom
## AIC: 1146.4
##
## Number of Fisher Scoring iterations: 7
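To make the summary above easier to read, the coefficients can be turned into odds ratios (a small addition, output not shown); for example, the positive estimate for Reclamos means each extra complaint multiplies the odds of churning by exp(0.432) ≈ 1.54:

# Odds ratios: exp() of the logistic regression coefficients
round(exp(coef(des_nbal)), 3)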
### Model prediction
des_nbal_pred <- predict(des_nbal, newdata = des_test, type = "response")
head(des_nbal_pred)
## 3 7 8 10 16 17
## 0.011372920 0.046137619 0.012031130 0.012001713 0.012110974 0.009142702
### Predicted churn class (cutoff 0.5)
des_nbal_pred_clase <- factor(ifelse(des_nbal_pred > 0.5, "Si", "No"),
                              levels = c("No", "Si"))
levels(des_test$Desafiliado) <- c("No","Si")
table(Predic = des_nbal_pred_clase, Real = des_test$Desafiliado)
## Real
## Predic No Si
## No 858 39
## Si 0 2
library(caret)
confusionMatrix(des_nbal_pred_clase,
des_test$Desafiliado,
positive = "Si",
mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Si
## No 858 39
## Si 0 2
##
## Accuracy : 0.9566
## 95% CI : (0.9412, 0.969)
## No Information Rate : 0.9544
## P-Value [Acc > NIR] : 0.4142
##
## Kappa : 0.0892
##
## Mcnemar's Test P-Value : 1.166e-09
##
## Sensitivity : 0.048780
## Specificity : 1.000000
## Pos Pred Value : 1.000000
## Neg Pred Value : 0.956522
## Precision : 1.000000
## Recall : 0.048780
## F1 : 0.093023
## Prevalence : 0.045606
## Detection Rate : 0.002225
## Detection Prevalence : 0.002225
## Balanced Accuracy : 0.524390
##
## 'Positive' Class : Si
##
confusionMatrix(des_nbal_pred_clase, des_test$Desafiliado, positive = "Si")$overall[1]
## Accuracy
## 0.9566185
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.0.4
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
ce(actual = des_test$Desafiliado, predicted = des_nbal_pred_clase)
## [1] 0.04338154
library(Metrics)
auc(actual = ifelse(des_test$Desafiliado == "Si", 1, 0),
predicted = des_nbal_pred)
## [1] 0.8104497
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.4
ROCRpre <- prediction(des_nbal_pred, des_test$Desafiliado)
ROCRper <- performance(ROCRpre, "tpr", "fpr")
plot(ROCRper)
plot(ROCRper, colorize = TRUE)
plot(ROCRper, colorize = TRUE, print.cutoffs.at = seq(0, 1, by = 0.1), text.adj = c(-0.2,1.7))
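As a cross-check (not part of the original run), ROCR can report the same area under the curve from the prediction object built above; it should agree with the Metrics::auc() value of about 0.81:

# AUC from the ROCR prediction object
performance(ROCRpre, "auc")@y.values[[1]]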
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.4
## Loaded ROSE 0.0-3
prop.table(table(des_train$Desafiliado))
##
## no yes
## 0.95390169 0.04609831
des_bal_over <- ovun.sample(Desafiliado ~ .,
data = des_train,
method = "over",
N = table(des_train$Desafiliado)[1]*2)$data
table(des_bal_over$Desafiliado)
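Since N is set to twice the majority-class count, the over-sampled set should be roughly 50/50; a quick check (added here, output not shown):

prop.table(table(des_bal_over$Desafiliado))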
head(des_bal_over)
## Plan_internacional Minutos_dia Minutos_internacionales Reclamos
## 1 no 202.9 9.0 3
## 2 no 229.2 7.4 3
## 3 no 125.0 10.2 2
## 4 no 188.5 15.2 2
## 5 no 155.6 10.8 3
## 6 no 110.8 8.9 2
## Llamadas_internacionales Desafiliado
## 1 3 no
## 2 6 no
## 3 7 no
## 4 5 no
## 5 4 no
## 6 4 no
library(caret)
split <- 0.8 # Proportion of data assigned to the training set
trainIndexs <- createDataPartition(des$Desafiliado, p = split, list = FALSE)
des_trainOv <- des[trainIndex,]
des_testOv <- des[-trainIndex,]
# Logistic model (note: the fit below uses des_train, not the over-sampled des_bal_over)
des_nbalOv <- glm(Desafiliado ~., data = des_train, family = binomial(link = "logit"))
summary(des_nbalOv)
##
## Call:
## glm(formula = Desafiliado ~ ., family = binomial(link = "logit"),
## data = des_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4449 -0.3048 -0.2086 -0.1449 3.2516
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.856797 0.570306 -13.776 < 2e-16 ***
## Plan_internacionalyes 1.811382 0.202432 8.948 < 2e-16 ***
## Minutos_dia 0.016488 0.001794 9.192 < 2e-16 ***
## Minutos_internacionales 0.066348 0.031182 2.128 0.0334 *
## Reclamos 0.432416 0.058170 7.434 1.06e-13 ***
## Llamadas_internacionales -0.034341 0.035702 -0.962 0.3361
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1345.8 on 3600 degrees of freedom
## Residual deviance: 1134.4 on 3595 degrees of freedom
## AIC: 1146.4
##
## Number of Fisher Scoring iterations: 7
des_nbal_predOv<- predict(des_nbalOv, newdata = des_test, type = "response")
head(des_nbal_predOv)
## 3 7 8 10 16 17
## 0.011372920 0.046137619 0.012031130 0.012001713 0.012110974 0.009142702
des_nbal_predOv_clase <- factor(ifelse(des_nbal_predOv > 0.5, "Si", "No"),
                                levels = c("No", "Si"))
levels(des_testOv$Desafiliado) <- c("No","Si")
table(Predics = des_nbal_predOv_clase, Real = des_test$Desafiliado)
## Real
## Predics No Si
## No 858 39
## Si 0 2
library(caret)
confusionMatrix(des_nbal_predOv_clase,
des_test$Desafiliado,
positive = "Si",
mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Si
## No 858 39
## Si 0 2
##
## Accuracy : 0.9566
## 95% CI : (0.9412, 0.969)
## No Information Rate : 0.9544
## P-Value [Acc > NIR] : 0.4142
##
## Kappa : 0.0892
##
## Mcnemar's Test P-Value : 1.166e-09
##
## Sensitivity : 0.048780
## Specificity : 1.000000
## Pos Pred Value : 1.000000
## Neg Pred Value : 0.956522
## Precision : 1.000000
## Recall : 0.048780
## F1 : 0.093023
## Prevalence : 0.045606
## Detection Rate : 0.002225
## Detection Prevalence : 0.002225
## Balanced Accuracy : 0.524390
##
## 'Positive' Class : Si
##
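Note that des_nbalOv above was fit with data = des_train (see its Call), which is why its summary and confusion matrix match the unbalanced model exactly. A minimal sketch of fitting on the over-sampled data instead, evaluated on the same test set (object names are illustrative and the output is not reproduced here):

# Refit the logistic model on the over-sampled training data
mod_over <- glm(Desafiliado ~ ., data = des_bal_over,
                family = binomial(link = "logit"))
pred_over <- predict(mod_over, newdata = des_test, type = "response")
clase_over <- factor(ifelse(pred_over > 0.5, "Si", "No"), levels = c("No", "Si"))
confusionMatrix(clase_over, des_test$Desafiliado, positive = "Si")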
library(performanceEstimation)
## Warning: package 'performanceEstimation' was built under R version 4.0.4
set.seed(2019) # For reproducible results
des_bal_smote <- performanceEstimation::smote(Desafiliado ~ .,
data = des_train,
perc.over = 2,
k = 5,
perc.under = 2)
table(des_bal_smote$Desafiliado)
##
## no yes
## 664 498
prop.table(table(des_bal_smote$Desafiliado))
##
## no yes
## 0.5714286 0.4285714
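These counts follow from the smote() arguments: with perc.over = 2 each minority case generates two synthetic cases, and perc.under = 2 keeps two majority cases per synthetic case. A quick arithmetic check under that reading of the arguments (output not shown):

# n_yes minority cases -> 3*n_yes "yes" (originals + synthetic) and 2*(2*n_yes) "no"
n_yes <- sum(des_train$Desafiliado == "yes")
c(yes_expected = 3 * n_yes, no_expected = 2 * (2 * n_yes))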
library(caret)
split <- 0.8 # Proportion of data assigned to the training set
trainIndexs <- createDataPartition(des$Desafiliado, p = split, list = FALSE)
des_trainSmote <- des[trainIndex,]
des_testSmote <- des[-trainIndex,]
# Logistic model (note: the fit below uses des_train, not the SMOTE-balanced des_bal_smote)
des_nbalsmote <- glm(Desafiliado ~., data = des_train, family = binomial(link = "logit"))
summary(des_nbalsmote)
##
## Call:
## glm(formula = Desafiliado ~ ., family = binomial(link = "logit"),
## data = des_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4449 -0.3048 -0.2086 -0.1449 3.2516
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.856797 0.570306 -13.776 < 2e-16 ***
## Plan_internacionalyes 1.811382 0.202432 8.948 < 2e-16 ***
## Minutos_dia 0.016488 0.001794 9.192 < 2e-16 ***
## Minutos_internacionales 0.066348 0.031182 2.128 0.0334 *
## Reclamos 0.432416 0.058170 7.434 1.06e-13 ***
## Llamadas_internacionales -0.034341 0.035702 -0.962 0.3361
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1345.8 on 3600 degrees of freedom
## Residual deviance: 1134.4 on 3595 degrees of freedom
## AIC: 1146.4
##
## Number of Fisher Scoring iterations: 7
des_nbal_predsmote <- predict(des_nbalsmote, newdata = des_test, type = "response")
head(des_nbal_predsmote)
## 3 7 8 10 16 17
## 0.011372920 0.046137619 0.012031130 0.012001713 0.012110974 0.009142702
des_nbal_predsmote_clase <- factor(ifelse(des_nbal_predsmote > 0.5, "Si", "No"),
                                   levels = c("No", "Si"))
levels(des_testSmote$Desafiliado) <- c("No","Si")
table(Predic = des_nbal_predsmote_clase, Real = des_test$Desafiliado)
## Real
## Predic No Si
## No 858 39
## Si 0 2
library(caret)
confusionMatrix(des_nbal_predsmote_clase,
des_test$Desafiliado,
positive = "Si",
mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Si
## No 858 39
## Si 0 2
##
## Accuracy : 0.9566
## 95% CI : (0.9412, 0.969)
## No Information Rate : 0.9544
## P-Value [Acc > NIR] : 0.4142
##
## Kappa : 0.0892
##
## Mcnemar's Test P-Value : 1.166e-09
##
## Sensitivity : 0.048780
## Specificity : 1.000000
## Pos Pred Value : 1.000000
## Neg Pred Value : 0.956522
## Precision : 1.000000
## Recall : 0.048780
## F1 : 0.093023
## Prevalence : 0.045606
## Detection Rate : 0.002225
## Detection Prevalence : 0.002225
## Balanced Accuracy : 0.524390
##
## 'Positive' Class : Si
##
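Likewise, des_nbalsmote was fit on des_train rather than on the SMOTE-balanced data. A minimal sketch of the balanced fit, with illustrative object names and output not reproduced here:

# Refit the logistic model on the SMOTE-balanced training data
mod_smote <- glm(Desafiliado ~ ., data = des_bal_smote,
                 family = binomial(link = "logit"))
pred_smote <- predict(mod_smote, newdata = des_test, type = "response")
clase_smote <- factor(ifelse(pred_smote > 0.5, "Si", "No"), levels = c("No", "Si"))
confusionMatrix(clase_smote, des_test$Desafiliado, positive = "Si")

With either balanced fit, overall accuracy would be expected to drop while sensitivity for the "Si" class rises, which is the trade-off the balancing exercise is meant to expose.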
We can see that there is no variation in accuracy across questions 3, 4 and 5. This is expected from the code above: all three glm models were fit on the same des_train data (not on des_bal_over or des_bal_smote), so the same model was evaluated on the same test set each time; the sketches above show how the balanced data sets could be used for the fits instead.