Contexto

Llamar paquetes e librerias

#install.packages("tidyverse")
#install.packages("caret")
#install.packages("e1071")
library(tidyverse)
library(caret)
library(e1071)

Importar base de datos

telco <- read.csv("C:\\Users\\admin\\Downloads\\Telco Customer Churn.csv")

Entender los datos

head(telco)
##   customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## 1 7590-VHVEG Female             0     Yes         No      1           No
## 2 5575-GNVDE   Male             0      No         No     34          Yes
## 3 3668-QPYBK   Male             0      No         No      2          Yes
## 4 7795-CFOCW   Male             0      No         No     45           No
## 5 9237-HQITU Female             0      No         No      2          Yes
## 6 9305-CDSKC Female             0      No         No      8          Yes
##      MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1 No phone service             DSL             No          Yes               No
## 2               No             DSL            Yes           No              Yes
## 3               No             DSL            Yes          Yes               No
## 4 No phone service             DSL            Yes           No              Yes
## 5               No     Fiber optic             No           No               No
## 6              Yes     Fiber optic             No           No              Yes
##   TechSupport StreamingTV StreamingMovies       Contract PaperlessBilling
## 1          No          No              No Month-to-month              Yes
## 2          No          No              No       One year               No
## 3          No          No              No Month-to-month              Yes
## 4         Yes          No              No       One year               No
## 5          No          No              No Month-to-month              Yes
## 6          No         Yes             Yes Month-to-month              Yes
##               PaymentMethod MonthlyCharges TotalCharges Churn
## 1          Electronic check          29.85        29.85    No
## 2              Mailed check          56.95      1889.50    No
## 3              Mailed check          53.85       108.15   Yes
## 4 Bank transfer (automatic)          42.30      1840.75    No
## 5          Electronic check          70.70       151.65   Yes
## 6          Electronic check          99.65       820.50   Yes
summary(telco)
##   customerID           gender          SeniorCitizen      Partner         
##  Length:7043        Length:7043        Min.   :0.0000   Length:7043       
##  Class :character   Class :character   1st Qu.:0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :0.0000   Mode  :character  
##                                        Mean   :0.1621                     
##                                        3rd Qu.:0.0000                     
##                                        Max.   :1.0000                     
##                                                                           
##   Dependents            tenure      PhoneService       MultipleLines     
##  Length:7043        Min.   : 0.00   Length:7043        Length:7043       
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :29.00   Mode  :character   Mode  :character  
##                     Mean   :32.37                                        
##                     3rd Qu.:55.00                                        
##                     Max.   :72.00                                        
##                                                                          
##  InternetService    OnlineSecurity     OnlineBackup       DeviceProtection  
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  TechSupport        StreamingTV        StreamingMovies      Contract        
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PaperlessBilling   PaymentMethod      MonthlyCharges    TotalCharges   
##  Length:7043        Length:7043        Min.   : 18.25   Min.   :  18.8  
##  Class :character   Class :character   1st Qu.: 35.50   1st Qu.: 401.4  
##  Mode  :character   Mode  :character   Median : 70.35   Median :1397.5  
##                                        Mean   : 64.76   Mean   :2283.3  
##                                        3rd Qu.: 89.85   3rd Qu.:3794.7  
##                                        Max.   :118.75   Max.   :8684.8  
##                                                         NA's   :11      
##     Churn          
##  Length:7043       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
str(telco)
## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...

Preparar los datos

#Eliminar customerID
telco_limpia <- telco %>% select(-customerID)
#Convertir las variables carácter a factores
telco_limpia <- telco_limpia %>% mutate(across(where(is.character),as.factor))
str(telco_limpia)
## 'data.frame':    7043 obs. of  20 variables:
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#Eliminar na's
telco_limpia <- na.omit(telco_limpia)

Entrenar el modelo

set.seed(123)
renglones_entrenamiento_telco <- createDataPartition(telco_limpia$Churn, p=.7, list = FALSE)
entrenamiento_telco <- telco_limpia[renglones_entrenamiento_telco, ]
prueba_telco <- telco_limpia[-renglones_entrenamiento_telco, ]

Generar el modelo

modelo_telco <- glm(Churn ~ ., data=entrenamiento_telco, family = binomial)
summary(modelo_telco)
## 
## Call:
## glm(formula = Churn ~ ., family = binomial, data = entrenamiento_telco)
## 
## Coefficients: (7 not defined because of singularities)
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           1.0746695  0.9571525   1.123 0.261532    
## genderMale                           -0.0482269  0.0771304  -0.625 0.531797    
## SeniorCitizen                         0.1200467  0.1014491   1.183 0.236683    
## PartnerYes                           -0.0910761  0.0918164  -0.992 0.321228    
## DependentsYes                        -0.0843260  0.1068161  -0.789 0.429849    
## tenure                               -0.0621135  0.0074487  -8.339  < 2e-16 ***
## PhoneServiceYes                       0.0233358  0.7635434   0.031 0.975618    
## MultipleLinesNo phone service                NA         NA      NA       NA    
## MultipleLinesYes                      0.3794100  0.2078464   1.825 0.067935 .  
## InternetServiceFiber optic            1.5613737  0.9372764   1.666 0.095741 .  
## InternetServiceNo                    -1.5047493  0.9480893  -1.587 0.112481    
## OnlineSecurityNo internet service            NA         NA      NA       NA    
## OnlineSecurityYes                    -0.2149292  0.2098780  -1.024 0.305804    
## OnlineBackupNo internet service              NA         NA      NA       NA    
## OnlineBackupYes                       0.0008494  0.2053088   0.004 0.996699    
## DeviceProtectionNo internet service          NA         NA      NA       NA    
## DeviceProtectionYes                   0.1177891  0.2084585   0.565 0.572041    
## TechSupportNo internet service               NA         NA      NA       NA    
## TechSupportYes                       -0.1280168  0.2128673  -0.601 0.547579    
## StreamingTVNo internet service               NA         NA      NA       NA    
## StreamingTVYes                        0.5187839  0.3819512   1.358 0.174386    
## StreamingMoviesNo internet service           NA         NA      NA       NA    
## StreamingMoviesYes                    0.5783808  0.3840289   1.506 0.132045    
## ContractOne year                     -0.8451411  0.1304035  -6.481 9.11e-11 ***
## ContractTwo year                     -1.4644194  0.2039949  -7.179 7.04e-13 ***
## PaperlessBillingYes                   0.3020753  0.0884590   3.415 0.000638 ***
## PaymentMethodCredit card (automatic) -0.0093796  0.1357499  -0.069 0.944914    
## PaymentMethodElectronic check         0.2897469  0.1124649   2.576 0.009985 ** 
## PaymentMethodMailed check            -0.1002695  0.1367827  -0.733 0.463524    
## MonthlyCharges                       -0.0336622  0.0372618  -0.903 0.366314    
## TotalCharges                          0.0003653  0.0000844   4.328 1.50e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5702.8  on 4923  degrees of freedom
## Residual deviance: 4117.1  on 4900  degrees of freedom
## AIC: 4165.1
## 
## Number of Fisher Scoring iterations: 6

Evaluar el modelo

# Predicción en entrenamiento
prediccion_entrenamiento_telco <- predict(modelo_telco, entrenamiento_telco,type = "response")
resultados_prediccion_entrenamiento_telco <- ifelse(prediccion_entrenamiento_telco >=.5, "Yes", "No")
#Matriz de confusión en entrenamiento
mcet <- confusionMatrix(factor(resultados_prediccion_entrenamiento_telco, levels=c("Yes","No")), entrenamiento_telco$Churn)
## Warning in
## confusionMatrix.default(factor(resultados_prediccion_entrenamiento_telco, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
mcet
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  3237  598
##        Yes  378  711
##                                           
##                Accuracy : 0.8018          
##                  95% CI : (0.7904, 0.8128)
##     No Information Rate : 0.7342          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4634          
##                                           
##  Mcnemar's Test P-Value : 2.383e-12       
##                                           
##             Sensitivity : 0.8954          
##             Specificity : 0.5432          
##          Pos Pred Value : 0.8441          
##          Neg Pred Value : 0.6529          
##              Prevalence : 0.7342          
##          Detection Rate : 0.6574          
##    Detection Prevalence : 0.7788          
##       Balanced Accuracy : 0.7193          
##                                           
##        'Positive' Class : No              
## 
prediccion_prueba_telco <- predict(modelo_telco, prueba_telco, type = "response")
resultados_prediccion_prueba_telco <- ifelse(prediccion_prueba_telco >= .5, "Yes", "No")

# Matriz de confusión en prueba
mcpr <- confusionMatrix(factor(resultados_prediccion_prueba_telco, levels = c("Yes","No")), prueba_telco$Churn)
## Warning in confusionMatrix.default(factor(resultados_prediccion_prueba_telco, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
mcpr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1385  229
##        Yes  163  331
##                                           
##                Accuracy : 0.814           
##                  95% CI : (0.7968, 0.8304)
##     No Information Rate : 0.7343          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5048          
##                                           
##  Mcnemar's Test P-Value : 0.001027        
##                                           
##             Sensitivity : 0.8947          
##             Specificity : 0.5911          
##          Pos Pred Value : 0.8581          
##          Neg Pred Value : 0.6700          
##              Prevalence : 0.7343          
##          Detection Rate : 0.6570          
##    Detection Prevalence : 0.7657          
##       Balanced Accuracy : 0.7429          
##                                           
##        'Positive' Class : No              
## 

Ejemplo Predicción

info_cliente <- telco_limpia[1, ]
info_cliente1 <- info_cliente %>% select(-Churn)
probabilidad_cliente <- predict(modelo_telco, info_cliente1, type = "response")
probabilidad_cliente
##         1 
## 0.6272032
prediccion_cliente <- ifelse(probabilidad_cliente >=.5, "Yes", "No")
prediccion_cliente
##     1 
## "Yes"