Telco es una empresa internacional que vende una variedad de servicios de telecomunicaciones. Chirn es un indicador de la diserción de clientes, fenómeno en el cual los clientes cancelas sus servicios con la empresa. El objetivo de este trabajo es predecir si un cliente anadonará o se quedará en la empresa.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
#install.packages(e1071) #Matríz de confusión
#library(e1071)
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
telco <- read.csv("/Users/anapaualvear/Downloads/Telco Customer Churn.csv")
data("customer_churn_tbl")
telco <- customer_churn_tbl
summary(telco)
## customerID gender SeniorCitizen Partner
## Length:7043 Length:7043 Min. :0.0000 Length:7043
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :0.0000 Mode :character
## Mean :0.1621
## 3rd Qu.:0.0000
## Max. :1.0000
##
## Dependents tenure PhoneService MultipleLines
## Length:7043 Min. : 0.00 Length:7043 Length:7043
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :29.00 Mode :character Mode :character
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## TechSupport StreamingTV StreamingMovies Contract
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## Length:7043 Length:7043 Min. : 18.25 Min. : 18.8
## Class :character Class :character 1st Qu.: 35.50 1st Qu.: 401.4
## Mode :character Mode :character Median : 70.35 Median :1397.5
## Mean : 64.76 Mean :2283.3
## 3rd Qu.: 89.85 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## NA's :11
## Churn
## Length:7043
## Class :character
## Mode :character
##
##
##
##
head(telco)
## # A tibble: 6 × 21
## customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 7590-VHVEG Female 0 Yes No 1 No
## 2 5575-GNVDE Male 0 No No 34 Yes
## 3 3668-QPYBK Male 0 No No 2 Yes
## 4 7795-CFOCW Male 0 No No 45 No
## 5 9237-HQITU Female 0 No No 2 Yes
## 6 9305-CDSKC Female 0 No No 8 Yes
## # ℹ 14 more variables: MultipleLines <chr>, InternetService <chr>,
## # OnlineSecurity <chr>, OnlineBackup <chr>, DeviceProtection <chr>,
## # TechSupport <chr>, StreamingTV <chr>, StreamingMovies <chr>,
## # Contract <chr>, PaperlessBilling <chr>, PaymentMethod <chr>,
## # MonthlyCharges <dbl>, TotalCharges <dbl>, Churn <chr>
str(telco)
## spc_tbl_ [7,043 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ customerID : chr [1:7043] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr [1:7043] "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : num [1:7043] 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr [1:7043] "Yes" "No" "No" "No" ...
## $ Dependents : chr [1:7043] "No" "No" "No" "No" ...
## $ tenure : num [1:7043] 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr [1:7043] "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr [1:7043] "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr [1:7043] "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr [1:7043] "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr [1:7043] "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr [1:7043] "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr [1:7043] "No" "No" "No" "Yes" ...
## $ StreamingTV : chr [1:7043] "No" "No" "No" "No" ...
## $ StreamingMovies : chr [1:7043] "No" "No" "No" "No" ...
## $ Contract : chr [1:7043] "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr [1:7043] "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr [1:7043] "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num [1:7043] 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num [1:7043] 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr [1:7043] "No" "No" "Yes" "No" ...
## - attr(*, "spec")=
## .. cols(
## .. customerID = col_character(),
## .. gender = col_character(),
## .. SeniorCitizen = col_double(),
## .. Partner = col_character(),
## .. Dependents = col_character(),
## .. tenure = col_double(),
## .. PhoneService = col_character(),
## .. MultipleLines = col_character(),
## .. InternetService = col_character(),
## .. OnlineSecurity = col_character(),
## .. OnlineBackup = col_character(),
## .. DeviceProtection = col_character(),
## .. TechSupport = col_character(),
## .. StreamingTV = col_character(),
## .. StreamingMovies = col_character(),
## .. Contract = col_character(),
## .. PaperlessBilling = col_character(),
## .. PaymentMethod = col_character(),
## .. MonthlyCharges = col_double(),
## .. TotalCharges = col_double(),
## .. Churn = col_character()
## .. )
La Regresión Logística es un modelo que se utiliza
para predecir la probabilidad de que ocurra un evento, basado en una o
más variables independientes. A diferencia de la regresión líneal, que
predice valores conttinuos (como el prrcio de una casa), la regresión
logóstoca predice resultados binarios o
categóricos, como un si/no, 1/0, verdadero/falso.
Si la probabilidad es mayor que 0.5, el modelo predice que el evento
ocurrirá (Ej. el cliente se dará de baja).
Si la probabilidad es menor de 0.5 predice el evento no ocurrirá.
#Eliminar columna de "CustomerID"
telco_limpia <- telco %>% select (-customerID)
#Convertir las variables caracter a factores
telco_limpia <- telco_limpia %>% mutate(across(where(is.character), (as.factor)))
str(telco_limpia)
## tibble [7,043 × 20] (S3: tbl_df/tbl/data.frame)
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : num [1:7043] 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : num [1:7043] 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num [1:7043] 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num [1:7043] 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#Eliminar NA's
telco_limpia <- na.omit(telco_limpia)
set.seed(123)
renglones_entrenamiento_telco <- createDataPartition(telco_limpia$Churn, p = 0.7, list = FALSE)
entrenamiento_telco <- telco_limpia[renglones_entrenamiento_telco, ]
prueba_telco <- telco_limpia[-renglones_entrenamiento_telco, ]
modelo_telco <- glm(Churn ~ ., data=entrenamiento_telco, family=binomial)
summary(modelo_telco)
##
## Call:
## glm(formula = Churn ~ ., family = binomial, data = entrenamiento_telco)
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0746695 0.9571525 1.123 0.261532
## genderMale -0.0482269 0.0771304 -0.625 0.531797
## SeniorCitizen 0.1200467 0.1014491 1.183 0.236683
## PartnerYes -0.0910761 0.0918164 -0.992 0.321228
## DependentsYes -0.0843260 0.1068161 -0.789 0.429849
## tenure -0.0621135 0.0074487 -8.339 < 2e-16 ***
## PhoneServiceYes 0.0233358 0.7635434 0.031 0.975618
## MultipleLinesNo phone service NA NA NA NA
## MultipleLinesYes 0.3794100 0.2078464 1.825 0.067935 .
## InternetServiceFiber optic 1.5613737 0.9372764 1.666 0.095741 .
## InternetServiceNo -1.5047493 0.9480893 -1.587 0.112481
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes -0.2149292 0.2098780 -1.024 0.305804
## OnlineBackupNo internet service NA NA NA NA
## OnlineBackupYes 0.0008494 0.2053088 0.004 0.996699
## DeviceProtectionNo internet service NA NA NA NA
## DeviceProtectionYes 0.1177891 0.2084585 0.565 0.572041
## TechSupportNo internet service NA NA NA NA
## TechSupportYes -0.1280168 0.2128673 -0.601 0.547579
## StreamingTVNo internet service NA NA NA NA
## StreamingTVYes 0.5187839 0.3819512 1.358 0.174386
## StreamingMoviesNo internet service NA NA NA NA
## StreamingMoviesYes 0.5783808 0.3840289 1.506 0.132045
## ContractOne year -0.8451411 0.1304035 -6.481 9.11e-11 ***
## ContractTwo year -1.4644194 0.2039949 -7.179 7.04e-13 ***
## PaperlessBillingYes 0.3020753 0.0884590 3.415 0.000638 ***
## PaymentMethodCredit card (automatic) -0.0093796 0.1357499 -0.069 0.944914
## PaymentMethodElectronic check 0.2897469 0.1124649 2.576 0.009985 **
## PaymentMethodMailed check -0.1002695 0.1367827 -0.733 0.463524
## MonthlyCharges -0.0336622 0.0372618 -0.903 0.366314
## TotalCharges 0.0003653 0.0000844 4.328 1.50e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5702.8 on 4923 degrees of freedom
## Residual deviance: 4117.1 on 4900 degrees of freedom
## AIC: 4165.1
##
## Number of Fisher Scoring iterations: 6
#Predicción en Entrenamiento
prediccion_entrenamiento_telco <- predict(modelo_telco, entrenamiento_telco, type="response")
resultado_prediccion_entrenamiento_telco <- ifelse(prediccion_entrenamiento_telco>=0.5,"Yes","No")
#Matríz de confusión de en Entrenamiento
mcet <- confusionMatrix(factor(resultado_prediccion_entrenamiento_telco, levels=c("Yes","No")), entrenamiento_telco$Churn)
## Warning in
## confusionMatrix.default(factor(resultado_prediccion_entrenamiento_telco, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
mcet
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3237 598
## Yes 378 711
##
## Accuracy : 0.8018
## 95% CI : (0.7904, 0.8128)
## No Information Rate : 0.7342
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4634
##
## Mcnemar's Test P-Value : 2.383e-12
##
## Sensitivity : 0.8954
## Specificity : 0.5432
## Pos Pred Value : 0.8441
## Neg Pred Value : 0.6529
## Prevalence : 0.7342
## Detection Rate : 0.6574
## Detection Prevalence : 0.7788
## Balanced Accuracy : 0.7193
##
## 'Positive' Class : No
##
#Predicción en Prueba
prediccion_prueba_telco <- predict(modelo_telco, prueba_telco, type="response")
resultado_prediccion_prueba_telco <- ifelse(prediccion_prueba_telco>=0.5,"Yes","No")
#Matríz de confusión de en Prueba
mcep <- confusionMatrix(factor(resultado_prediccion_prueba_telco, levels=c("Yes","no")), prueba_telco$Churn)
## Warning in confusionMatrix.default(factor(resultado_prediccion_prueba_telco, :
## The data contains levels not found in the data, but they are empty and will be
## dropped.
## Warning in confusionMatrix.default(factor(resultado_prediccion_prueba_telco, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
mcep
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 0 0
## Yes 163 331
##
## Accuracy : 0.67
## 95% CI : (0.6266, 0.7114)
## No Information Rate : 0.67
## P-Value [Acc > NIR] : 0.5212
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00
## Specificity : 1.00
## Pos Pred Value : NaN
## Neg Pred Value : 0.67
## Prevalence : 0.33
## Detection Rate : 0.00
## Detection Prevalence : 0.00
## Balanced Accuracy : 0.50
##
## 'Positive' Class : No
##
info_cliente <- telco_limpia[11, ]
info_cliente <- info_cliente %>% select(-Churn)
probabilidad_cliente <- predict (modelo_telco, info_cliente, type="response")
probabilidad_cliente
## 1
## 0.1957503
prediccion_cliente <- ifelse(probabilidad_cliente>=0.5,"Yes","No")
prediccion_cliente
## 1
## "No"