
Contexto
Llamar paquetes e librerias
#install.packages("tidyverse")
#install.packages("caret")
#install.packages("e1071")
library(tidyverse)
library(caret)
library(e1071)
Importar base de datos
telco <- read.csv("C:\\Users\\admin\\Downloads\\Telco Customer Churn.csv")
Entender los datos
head(telco)
## customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## 1 7590-VHVEG Female 0 Yes No 1 No
## 2 5575-GNVDE Male 0 No No 34 Yes
## 3 3668-QPYBK Male 0 No No 2 Yes
## 4 7795-CFOCW Male 0 No No 45 No
## 5 9237-HQITU Female 0 No No 2 Yes
## 6 9305-CDSKC Female 0 No No 8 Yes
## MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1 No phone service DSL No Yes No
## 2 No DSL Yes No Yes
## 3 No DSL Yes Yes No
## 4 No phone service DSL Yes No Yes
## 5 No Fiber optic No No No
## 6 Yes Fiber optic No No Yes
## TechSupport StreamingTV StreamingMovies Contract PaperlessBilling
## 1 No No No Month-to-month Yes
## 2 No No No One year No
## 3 No No No Month-to-month Yes
## 4 Yes No No One year No
## 5 No No No Month-to-month Yes
## 6 No Yes Yes Month-to-month Yes
## PaymentMethod MonthlyCharges TotalCharges Churn
## 1 Electronic check 29.85 29.85 No
## 2 Mailed check 56.95 1889.50 No
## 3 Mailed check 53.85 108.15 Yes
## 4 Bank transfer (automatic) 42.30 1840.75 No
## 5 Electronic check 70.70 151.65 Yes
## 6 Electronic check 99.65 820.50 Yes
summary(telco)
## customerID gender SeniorCitizen Partner
## Length:7043 Length:7043 Min. :0.0000 Length:7043
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :0.0000 Mode :character
## Mean :0.1621
## 3rd Qu.:0.0000
## Max. :1.0000
##
## Dependents tenure PhoneService MultipleLines
## Length:7043 Min. : 0.00 Length:7043 Length:7043
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :29.00 Mode :character Mode :character
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## TechSupport StreamingTV StreamingMovies Contract
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## Length:7043 Length:7043 Min. : 18.25 Min. : 18.8
## Class :character Class :character 1st Qu.: 35.50 1st Qu.: 401.4
## Mode :character Mode :character Median : 70.35 Median :1397.5
## Mean : 64.76 Mean :2283.3
## 3rd Qu.: 89.85 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## NA's :11
## Churn
## Length:7043
## Class :character
## Mode :character
##
##
##
##
str(telco)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
Preparar los datos
#Eliminar customerID
telco_limpia <- telco %>% select(-customerID)
#Convertir las variables carácter a factores
telco_limpia <- telco_limpia %>% mutate(across(where(is.character),as.factor))
str(telco_limpia)
## 'data.frame': 7043 obs. of 20 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#Eliminar na's
telco_limpia <- na.omit(telco_limpia)
Entrenar el modelo
set.seed(123)
renglones_entrenamiento_telco <- createDataPartition(telco_limpia$Churn, p=.7, list = FALSE)
entrenamiento_telco <- telco_limpia[renglones_entrenamiento_telco, ]
prueba_telco <- telco_limpia[-renglones_entrenamiento_telco, ]
Generar el modelo
modelo_telco <- glm(Churn ~ ., data=entrenamiento_telco, family = binomial)
summary(modelo_telco)
##
## Call:
## glm(formula = Churn ~ ., family = binomial, data = entrenamiento_telco)
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0746695 0.9571525 1.123 0.261532
## genderMale -0.0482269 0.0771304 -0.625 0.531797
## SeniorCitizen 0.1200467 0.1014491 1.183 0.236683
## PartnerYes -0.0910761 0.0918164 -0.992 0.321228
## DependentsYes -0.0843260 0.1068161 -0.789 0.429849
## tenure -0.0621135 0.0074487 -8.339 < 2e-16 ***
## PhoneServiceYes 0.0233358 0.7635434 0.031 0.975618
## MultipleLinesNo phone service NA NA NA NA
## MultipleLinesYes 0.3794100 0.2078464 1.825 0.067935 .
## InternetServiceFiber optic 1.5613737 0.9372764 1.666 0.095741 .
## InternetServiceNo -1.5047493 0.9480893 -1.587 0.112481
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes -0.2149292 0.2098780 -1.024 0.305804
## OnlineBackupNo internet service NA NA NA NA
## OnlineBackupYes 0.0008494 0.2053088 0.004 0.996699
## DeviceProtectionNo internet service NA NA NA NA
## DeviceProtectionYes 0.1177891 0.2084585 0.565 0.572041
## TechSupportNo internet service NA NA NA NA
## TechSupportYes -0.1280168 0.2128673 -0.601 0.547579
## StreamingTVNo internet service NA NA NA NA
## StreamingTVYes 0.5187839 0.3819512 1.358 0.174386
## StreamingMoviesNo internet service NA NA NA NA
## StreamingMoviesYes 0.5783808 0.3840289 1.506 0.132045
## ContractOne year -0.8451411 0.1304035 -6.481 9.11e-11 ***
## ContractTwo year -1.4644194 0.2039949 -7.179 7.04e-13 ***
## PaperlessBillingYes 0.3020753 0.0884590 3.415 0.000638 ***
## PaymentMethodCredit card (automatic) -0.0093796 0.1357499 -0.069 0.944914
## PaymentMethodElectronic check 0.2897469 0.1124649 2.576 0.009985 **
## PaymentMethodMailed check -0.1002695 0.1367827 -0.733 0.463524
## MonthlyCharges -0.0336622 0.0372618 -0.903 0.366314
## TotalCharges 0.0003653 0.0000844 4.328 1.50e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5702.8 on 4923 degrees of freedom
## Residual deviance: 4117.1 on 4900 degrees of freedom
## AIC: 4165.1
##
## Number of Fisher Scoring iterations: 6
Evaluar el modelo
# Predicción en entrenamiento
prediccion_entrenamiento_telco <- predict(modelo_telco, entrenamiento_telco,type = "response")
resultados_prediccion_entrenamiento_telco <- ifelse(prediccion_entrenamiento_telco >=.5, "Yes", "No")
#Matriz de confusión en entrenamiento
mcet <- confusionMatrix(factor(resultados_prediccion_entrenamiento_telco, levels=c("Yes","No")), entrenamiento_telco$Churn)
## Warning in
## confusionMatrix.default(factor(resultados_prediccion_entrenamiento_telco, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
mcet
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3237 598
## Yes 378 711
##
## Accuracy : 0.8018
## 95% CI : (0.7904, 0.8128)
## No Information Rate : 0.7342
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4634
##
## Mcnemar's Test P-Value : 2.383e-12
##
## Sensitivity : 0.8954
## Specificity : 0.5432
## Pos Pred Value : 0.8441
## Neg Pred Value : 0.6529
## Prevalence : 0.7342
## Detection Rate : 0.6574
## Detection Prevalence : 0.7788
## Balanced Accuracy : 0.7193
##
## 'Positive' Class : No
##
prediccion_prueba_telco <- predict(modelo_telco, prueba_telco, type = "response")
resultados_prediccion_prueba_telco <- ifelse(prediccion_prueba_telco >= .5, "Yes", "No")
# Matriz de confusión en prueba
mcpr <- confusionMatrix(factor(resultados_prediccion_prueba_telco, levels = c("Yes","No")), prueba_telco$Churn)
## Warning in confusionMatrix.default(factor(resultados_prediccion_prueba_telco, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
mcpr
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1385 229
## Yes 163 331
##
## Accuracy : 0.814
## 95% CI : (0.7968, 0.8304)
## No Information Rate : 0.7343
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5048
##
## Mcnemar's Test P-Value : 0.001027
##
## Sensitivity : 0.8947
## Specificity : 0.5911
## Pos Pred Value : 0.8581
## Neg Pred Value : 0.6700
## Prevalence : 0.7343
## Detection Rate : 0.6570
## Detection Prevalence : 0.7657
## Balanced Accuracy : 0.7429
##
## 'Positive' Class : No
##
Ejemplo Predicción
info_cliente <- telco_limpia[1, ]
info_cliente1 <- info_cliente %>% select(-Churn)
probabilidad_cliente <- predict(modelo_telco, info_cliente1, type = "response")
probabilidad_cliente
## 1
## 0.6272032
prediccion_cliente <- ifelse(probabilidad_cliente >=.5, "Yes", "No")
prediccion_cliente
## 1
## "Yes"