Customer Churn

Packages

# install.packages("caret")
library(caret)
# install.packages("randomForest")
library(randomForest)

Cargar Datos

# Read the customer table from disk and drop every row that contains a
# missing value, in a single step.
churn <- na.omit(read.csv("customer_churn.csv"))

Entender Base de Datos

# Per-column overview: quantiles and mean for numeric columns, length and
# class for character columns.
summary(churn)
##    CustomerID          Age           Gender              Tenure     
##  Min.   :     2   Min.   :18.00   Length:440832      Min.   : 1.00  
##  1st Qu.:113622   1st Qu.:29.00   Class :character   1st Qu.:16.00  
##  Median :226126   Median :39.00   Mode  :character   Median :32.00  
##  Mean   :225399   Mean   :39.37                      Mean   :31.26  
##  3rd Qu.:337739   3rd Qu.:48.00                      3rd Qu.:46.00  
##  Max.   :449999   Max.   :65.00                      Max.   :60.00  
##  Usage.Frequency Support.Calls    Payment.Delay   Subscription.Type 
##  Min.   : 1.00   Min.   : 0.000   Min.   : 0.00   Length:440832     
##  1st Qu.: 9.00   1st Qu.: 1.000   1st Qu.: 6.00   Class :character  
##  Median :16.00   Median : 3.000   Median :12.00   Mode  :character  
##  Mean   :15.81   Mean   : 3.604   Mean   :12.97                     
##  3rd Qu.:23.00   3rd Qu.: 6.000   3rd Qu.:19.00                     
##  Max.   :30.00   Max.   :10.000   Max.   :30.00                     
##  Contract.Length     Total.Spend     Last.Interaction     Churn       
##  Length:440832      Min.   : 100.0   Min.   : 1.00    Min.   :0.0000  
##  Class :character   1st Qu.: 480.0   1st Qu.: 7.00    1st Qu.:0.0000  
##  Mode  :character   Median : 661.0   Median :14.00    Median :1.0000  
##                     Mean   : 631.6   Mean   :14.48    Mean   :0.5671  
##                     3rd Qu.: 830.0   3rd Qu.:22.00    3rd Qu.:1.0000  
##                     Max.   :1000.0   Max.   :30.00    Max.   :1.0000
# Compact structure listing: dimensions plus the type and first values of
# each column, as loaded (before any factor conversion).
str(churn)
## 'data.frame':    440832 obs. of  12 variables:
##  $ CustomerID       : int  2 3 4 5 6 8 9 10 11 12 ...
##  $ Age              : int  30 65 55 58 23 51 58 55 39 64 ...
##  $ Gender           : chr  "Female" "Female" "Female" "Male" ...
##  $ Tenure           : int  39 49 14 38 32 33 49 37 12 3 ...
##  $ Usage.Frequency  : int  14 1 4 21 20 25 12 8 5 25 ...
##  $ Support.Calls    : int  5 10 6 7 5 9 3 4 7 2 ...
##  $ Payment.Delay    : int  18 8 18 7 8 26 16 15 4 11 ...
##  $ Subscription.Type: chr  "Standard" "Basic" "Basic" "Standard" ...
##  $ Contract.Length  : chr  "Annual" "Monthly" "Quarterly" "Monthly" ...
##  $ Total.Spend      : num  932 557 185 396 617 129 821 445 969 415 ...
##  $ Last.Interaction : int  17 6 3 29 20 8 24 30 13 29 ...
##  $ Churn            : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int 199296
##   ..- attr(*, "names")= chr "199296"
# Preview the first rows of the table.
head(churn)
##   CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1          2  30 Female     39              14             5            18
## 2          3  65 Female     49               1            10             8
## 3          4  55 Female     14               4             6            18
## 4          5  58   Male     38              21             7             7
## 5          6  23   Male     32              20             5             8
## 6          8  51   Male     33              25             9            26
##   Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1          Standard          Annual         932               17     1
## 2             Basic         Monthly         557                6     1
## 3             Basic       Quarterly         185                3     1
## 4          Standard         Monthly         396               29     1
## 5             Basic         Monthly         617               20     1
## 6           Premium          Annual         129                8     1
# Recode the churn label and every categorical descriptor as a factor.
# Gender is converted along with the other categorical columns: a character
# column carries no fixed set of levels, so any numeric coding derived from
# it depends on the values present in whichever data frame is being
# processed. A factor pins the levels once, so training and scoring always
# agree on the encoding.
churn$Churn <- as.factor(churn$Churn)
churn$Gender <- as.factor(churn$Gender)
churn$Contract.Length <- as.factor(churn$Contract.Length)
churn$Subscription.Type <- as.factor(churn$Subscription.Type)

# Confirm the new column types.
str(churn)
## 'data.frame':    440832 obs. of  12 variables:
##  $ CustomerID       : int  2 3 4 5 6 8 9 10 11 12 ...
##  $ Age              : int  30 65 55 58 23 51 58 55 39 64 ...
##  $ Gender           : chr  "Female" "Female" "Female" "Male" ...
##  $ Tenure           : int  39 49 14 38 32 33 49 37 12 3 ...
##  $ Usage.Frequency  : int  14 1 4 21 20 25 12 8 5 25 ...
##  $ Support.Calls    : int  5 10 6 7 5 9 3 4 7 2 ...
##  $ Payment.Delay    : int  18 8 18 7 8 26 16 15 4 11 ...
##  $ Subscription.Type: Factor w/ 3 levels "Basic","Premium",..: 3 1 1 3 1 2 3 2 3 3 ...
##  $ Contract.Length  : Factor w/ 3 levels "Annual","Monthly",..: 1 2 3 2 2 1 3 1 3 3 ...
##  $ Total.Spend      : num  932 557 185 396 617 129 821 445 969 415 ...
##  $ Last.Interaction : int  17 6 3 29 20 8 24 30 13 29 ...
##  $ Churn            : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int 199296
##   ..- attr(*, "names")= chr "199296"

Partir Datos

# Fix the RNG state so the split (and everything random after it) is
# reproducible from run to run.
set.seed(123)

# Draw 80% of the row indices, sampled within each Churn class so both
# subsets keep the same class proportions. list = FALSE returns the indices
# as a matrix rather than a list.
renglones.entrenamiento <- createDataPartition(
  y = churn$Churn,
  p = 0.8,
  list = FALSE
)

# Selected rows form the training set; the complement is held out for
# testing.
entrenamiento <- churn[renglones.entrenamiento, , drop = FALSE]
prueba <- churn[-renglones.entrenamiento, , drop = FALSE]

Entrenar Modelo

# Fit a 100-tree random forest that predicts Churn from every other column
# except the CustomerID row identifier. importance = TRUE stores the
# variable-importance measures with the fitted model.
modelo <- randomForest(
  Churn ~ . - CustomerID,
  data = entrenamiento,
  ntree = 100,
  importance = TRUE
)

Matriz de Confusión

# Training split: score the same rows the forest was fit on and tabulate
# the predicted labels against the observed ones.
resultado_entrenamiento <- predict(modelo, newdata = entrenamiento)
matriz_entrenamiento <- confusionMatrix(
  data = resultado_entrenamiento,
  reference = entrenamiento$Churn
)

# Test split: the same two steps on the held-out rows.
resultado_prueba <- predict(modelo, newdata = prueba)
matriz_prueba <- confusionMatrix(
  data = resultado_prueba,
  reference = prueba$Churn
)

print(matriz_entrenamiento)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 152667      0
##          1      0 200000
##                                    
##                Accuracy : 1        
##                  95% CI : (1, 1)   
##     No Information Rate : 0.5671   
##     P-Value [Acc > NIR] : < 2.2e-16
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.4329   
##          Detection Rate : 0.4329   
##    Detection Prevalence : 0.4329   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : 0        
## 
# Show the confusion matrix and statistics computed on the held-out test rows.
print(matriz_prueba)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 38165    10
##          1     1 49989
##                                           
##                Accuracy : 0.9999          
##                  95% CI : (0.9998, 0.9999)
##     No Information Rate : 0.5671          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9997          
##                                           
##  Mcnemar's Test P-Value : 0.01586         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9998          
##          Pos Pred Value : 0.9997          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.4329          
##          Detection Rate : 0.4329          
##    Detection Prevalence : 0.4330          
##       Balanced Accuracy : 0.9999          
##                                           
##        'Positive' Class : 0               
## 

Gráficas

# Default plot of the fitted forest (error rates as trees are added).
plot(modelo)

# Dot chart of the variable-importance measures stored in the model, which
# was fit with importance = TRUE.
varImpPlot(modelo)

LS0tCnRpdGxlOiAiIgphdXRob3I6ICJNYXJjZWxvIFJleWVzIEEwMTcyMzMyMSIKZGF0ZTogIjIwMjYtMDItMjUiCm91dHB1dDoKICBodG1sX2RvY3VtZW50OgogICAgdG9jOiBUUlVFCiAgICB0b2NfZGVwdGg6IDIKICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUKICAgIHRoZW1lOiBjZXJ1bGVhbgotLS0KCjxoMSBzdHlsZT0iY29sb3I6IHJlZDsgZm9udC1mYW1pbHk6ICdJbXBhY3QnLCAnQXJpYWwgQmxhY2snLCBzYW5zLXNlcmlmOyBmb250LXNpemU6IDZlbTsgZm9udC13ZWlnaHQ6IDkwMDsgdGV4dC10cmFuc2Zvcm06IHVwcGVyY2FzZTsgbGluZS1oZWlnaHQ6IDE7IHRleHQtYWxpZ246IGNlbnRlcjsiPkN1c3RvbWVyIENodXJuPC9oMT4KPGgyIHN0eWxlPSJjb2xvcjogYmx1ZTsgZm9udC1mYW1pbHk6ICdJbXBhY3QnLCAnQXJpYWwgQmxhY2snLCBzYW5zLXNlcmlmOyBmb250LXNpemU6IDRlbTsgZm9udC13ZWlnaHQ6IDkwMDsgdGV4dC10cmFuc2Zvcm06IHVwcGVyY2FzZTsgbGluZS1oZWlnaHQ6IDE7Ij5QYWNrYWdlczwvaDI+CgpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQojIGluc3RhbGwucGFja2FnZXMoImNhcmV0IikKbGlicmFyeShjYXJldCkKIyBpbnN0YWxsLnBhY2thZ2VzKCJyYW5kb21Gb3Jlc3QiKQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkKYGBgCgo8aDIgc3R5bGU9ImNvbG9yOiBibHVlOyBmb250LWZhbWlseTogJ0ltcGFjdCcsICdBcmlhbCBCbGFjaycsIHNhbnMtc2VyaWY7IGZvbnQtc2l6ZTogNGVtOyBmb250LXdlaWdodDogOTAwOyB0ZXh0LXRyYW5zZm9ybTogdXBwZXJjYXNlOyBsaW5lLWhlaWdodDogMTsiPkNhcmdhciBEYXRvczwvaDI+CgpgYGB7cn0KY2h1cm4gPC0gcmVhZC5jc3YoImN1c3RvbWVyX2NodXJuLmNzdiIpCmNodXJuIDwtIG5hLm9taXQoY2h1cm4pICMgRWxpbWluYXIgZmlsYXMgY29uIHZhbG9yZXMgZmFsdGFudGVzCmBgYAoKPGgyIHN0eWxlPSJjb2xvcjogYmx1ZTsgZm9udC1mYW1pbHk6ICdJbXBhY3QnLCAnQXJpYWwgQmxhY2snLCBzYW5zLXNlcmlmOyBmb250LXNpemU6IDRlbTsgZm9udC13ZWlnaHQ6IDkwMDsgdGV4dC10cmFuc2Zvcm06IHVwcGVyY2FzZTsgbGluZS1oZWlnaHQ6IDE7Ij5FbnRlbmRlciBCYXNlIGRlIERhdG9zPC9oMj4KCmBgYHtyfQpzdW1tYXJ5KGNodXJuKQpzdHIoY2h1cm4pCmhlYWQoY2h1cm4pCgpjaHVybiRDaHVybiA8LSBhcy5mYWN0b3IoY2h1cm4kQ2h1cm4pCmNodXJuJENvbnRyYWN0Lkxlbmd0aCA8LSBhcy5mYWN0b3IoY2h1cm4kQ29udHJhY3QuTGVuZ3RoKQpjaHVybiRTdWJzY3JpcHRpb24uVHlwZSA8LSBhcy5mYWN0b3IoY2h1cm4kU3Vic2NyaXB0aW9uLlR5cGUpCgpzdHIoY2h1cm4pCmBgYAoKPGgyIHN0eWxlPSJjb2xvcjogYmx1ZTsgZm9udC1mYW1pbHk6ICdJbXBhY3QnLCAnQXJpYWwgQmxhY2snLCBzYW5zLXNlcmlmOyBmb250LXNpemU6IDRlbTsgZm9udC13ZWlnaHQ6IDkwMDsgdGV4dC10cmFuc2Zvcm06IHVwcGVyY2FzZTsgbGluZS1o
ZWlnaHQ6IDE7Ij5QYXJ0aXIgRGF0b3M8L2gyPgoKYGBge3J9CnNldC5zZWVkKDEyMykKcmVuZ2xvbmVzLmVudHJlbmFtaWVudG8gPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihjaHVybiRDaHVybiwgcCA9IDAuOCwgbGlzdCA9IEZBTFNFKQplbnRyZW5hbWllbnRvIDwtIGNodXJuW3Jlbmdsb25lcy5lbnRyZW5hbWllbnRvLCBdCnBydWViYSA8LSBjaHVyblstcmVuZ2xvbmVzLmVudHJlbmFtaWVudG8sIF0KYGBgCgo8aDIgc3R5bGU9ImNvbG9yOiBibHVlOyBmb250LWZhbWlseTogJ0ltcGFjdCcsICdBcmlhbCBCbGFjaycsIHNhbnMtc2VyaWY7IGZvbnQtc2l6ZTogNGVtOyBmb250LXdlaWdodDogOTAwOyB0ZXh0LXRyYW5zZm9ybTogdXBwZXJjYXNlOyBsaW5lLWhlaWdodDogMTsiPkVudHJlbmFyIE1vZGVsbzwvaDI+CgpgYGB7cn0KbW9kZWxvIDwtIHJhbmRvbUZvcmVzdChDaHVybiB+IC4gLSBDdXN0b21lcklELCBkYXRhID0gZW50cmVuYW1pZW50bywgbnRyZWUgPSAxMDAsIGltcG9ydGFuY2UgPSBUUlVFKQpgYGAKCjxoMiBzdHlsZT0iY29sb3I6IGJsdWU7IGZvbnQtZmFtaWx5OiAnSW1wYWN0JywgJ0FyaWFsIEJsYWNrJywgc2Fucy1zZXJpZjsgZm9udC1zaXplOiA0ZW07IGZvbnQtd2VpZ2h0OiA5MDA7IHRleHQtdHJhbnNmb3JtOiB1cHBlcmNhc2U7IGxpbmUtaGVpZ2h0OiAxOyI+TWF0cml6IGRlIENvbmZ1c2nDs248L2gyPgoKYGBge3J9CnJlc3VsdGFkb19lbnRyZW5hbWllbnRvIDwtIHByZWRpY3QobW9kZWxvLCBlbnRyZW5hbWllbnRvKQpyZXN1bHRhZG9fcHJ1ZWJhIDwtIHByZWRpY3QobW9kZWxvLCBwcnVlYmEpCgptYXRyaXpfZW50cmVuYW1pZW50byA8LSBjb25mdXNpb25NYXRyaXgocmVzdWx0YWRvX2VudHJlbmFtaWVudG8sIGVudHJlbmFtaWVudG8kQ2h1cm4pCm1hdHJpel9wcnVlYmEgPC0gY29uZnVzaW9uTWF0cml4KHJlc3VsdGFkb19wcnVlYmEsIHBydWViYSRDaHVybikKCnByaW50KG1hdHJpel9lbnRyZW5hbWllbnRvKQpwcmludChtYXRyaXpfcHJ1ZWJhKQpgYGAKCjxoMiBzdHlsZT0iY29sb3I6IGJsdWU7IGZvbnQtZmFtaWx5OiAnSW1wYWN0JywgJ0FyaWFsIEJsYWNrJywgc2Fucy1zZXJpZjsgZm9udC1zaXplOiA0ZW07IGZvbnQtd2VpZ2h0OiA5MDA7IHRleHQtdHJhbnNmb3JtOiB1cHBlcmNhc2U7IGxpbmUtaGVpZ2h0OiAxOyI+R3JhZmljYXM8L2gyPgoKYGBge3J9CnBsb3QobW9kZWxvKQp2YXJJbXBQbG90KG1vZGVsbykKYGBg