Teoría

Un bosque aleatorio es un conjunto (ensamble) de muchos árboles de decisión entrenados sobre diferentes subconjuntos aleatorios de los datos y variables.

Instalar paquetes y llamar librerías

#install.packages("caret")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
#install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin

Cargar la base de datos

# Absolute path to the customer churn CSV on the author's machine.
# Tip: run file.choose() interactively to look the path up on another machine.
ruta_datos <- "/Users/eduardojuniormedinahernandez/Downloads/customer_churn.csv"
churn <- read.csv(file = ruta_datos)

Entender la base de datos

# Per-column summary statistics of the raw data.
summary(object = churn)
##    CustomerID          Age           Gender              Tenure     
##  Min.   :     2   Min.   :18.00   Length:440833      Min.   : 1.00  
##  1st Qu.:113622   1st Qu.:29.00   Class :character   1st Qu.:16.00  
##  Median :226126   Median :39.00   Mode  :character   Median :32.00  
##  Mean   :225399   Mean   :39.37                      Mean   :31.26  
##  3rd Qu.:337739   3rd Qu.:48.00                      3rd Qu.:46.00  
##  Max.   :449999   Max.   :65.00                      Max.   :60.00  
##  NA's   :1        NA's   :1                          NA's   :1      
##  Usage.Frequency Support.Calls    Payment.Delay   Subscription.Type 
##  Min.   : 1.00   Min.   : 0.000   Min.   : 0.00   Length:440833     
##  1st Qu.: 9.00   1st Qu.: 1.000   1st Qu.: 6.00   Class :character  
##  Median :16.00   Median : 3.000   Median :12.00   Mode  :character  
##  Mean   :15.81   Mean   : 3.604   Mean   :12.97                     
##  3rd Qu.:23.00   3rd Qu.: 6.000   3rd Qu.:19.00                     
##  Max.   :30.00   Max.   :10.000   Max.   :30.00                     
##  NA's   :1       NA's   :1        NA's   :1                         
##  Contract.Length     Total.Spend     Last.Interaction     Churn       
##  Length:440833      Min.   : 100.0   Min.   : 1.00    Min.   :0.0000  
##  Class :character   1st Qu.: 480.0   1st Qu.: 7.00    1st Qu.:0.0000  
##  Mode  :character   Median : 661.0   Median :14.00    Median :1.0000  
##                     Mean   : 631.6   Mean   :14.48    Mean   :0.5671  
##                     3rd Qu.: 830.0   3rd Qu.:22.00    3rd Qu.:1.0000  
##                     Max.   :1000.0   Max.   :30.00    Max.   :1.0000  
##                     NA's   :1        NA's   :1        NA's   :1
# Compact view of the data frame: column types and the first few values.
str(object = churn)
## 'data.frame':    440833 obs. of  12 variables:
##  $ CustomerID       : int  2 3 4 5 6 8 9 10 11 12 ...
##  $ Age              : int  30 65 55 58 23 51 58 55 39 64 ...
##  $ Gender           : chr  "Female" "Female" "Female" "Male" ...
##  $ Tenure           : int  39 49 14 38 32 33 49 37 12 3 ...
##  $ Usage.Frequency  : int  14 1 4 21 20 25 12 8 5 25 ...
##  $ Support.Calls    : int  5 10 6 7 5 9 3 4 7 2 ...
##  $ Payment.Delay    : int  18 8 18 7 8 26 16 15 4 11 ...
##  $ Subscription.Type: chr  "Standard" "Basic" "Basic" "Standard" ...
##  $ Contract.Length  : chr  "Annual" "Monthly" "Quarterly" "Monthly" ...
##  $ Total.Spend      : num  932 557 185 396 617 129 821 445 969 415 ...
##  $ Last.Interaction : int  17 6 3 29 20 8 24 30 13 29 ...
##  $ Churn            : int  1 1 1 1 1 1 1 1 1 1 ...
# First six rows (head()'s default count, written out explicitly).
head(x = churn, n = 6L)
##   CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1          2  30 Female     39              14             5            18
## 2          3  65 Female     49               1            10             8
## 3          4  55 Female     14               4             6            18
## 4          5  58   Male     38              21             7             7
## 5          6  23   Male     32              20             5             8
## 6          8  51   Male     33              25             9            26
##   Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1          Standard          Annual         932               17     1
## 2             Basic         Monthly         557                6     1
## 3             Basic       Quarterly         185                3     1
## 4          Standard         Monthly         396               29     1
## 5             Basic         Monthly         617               20     1
## 6           Premium          Annual         129                8     1
# Drop incomplete rows first, then encode the categorical columns as factors.
# Converting after na.omit() limits the factor levels to values that occur in
# the retained rows. read.csv() may leave blank text fields as "" rather than
# NA, so converting before the rows are dropped can leave an unused "" level
# in Gender, Subscription.Type and Contract.Length.
churn <- na.omit(churn)

columnas_categoricas <- c(
  "Gender", "Churn", "Subscription.Type", "Contract.Length"
)
churn[columnas_categoricas] <- lapply(churn[columnas_categoricas], as.factor)

Partir datos 80-20

# 80/20 train/test split on the outcome column; the fixed seed makes the
# partition reproducible.
set.seed(seed = 123)
reglones_entrenamiento <- createDataPartition(
  y = churn[["Churn"]],
  p = 0.8,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
entrenamiento <- churn[reglones_entrenamiento, ]
prueba <- churn[-reglones_entrenamiento, ]

Modelos de Bosques Aleatorios

# Random forest classifier for Churn that uses every column except CustomerID
# as a predictor. importance = TRUE is requested so variable importance can be
# plotted from the fitted model.
modelo <- randomForest(
  Churn ~ . - CustomerID,
  data = entrenamiento,
  ntree = 100,
  importance = TRUE
)

Matrices de confusión

# Class predictions for the training rows and for the held-out test rows.
resultado_entrenamiento <- predict(modelo, newdata = entrenamiento)
resultado_prueba <- predict(modelo, newdata = prueba)

# Confusion matrix for the training-set predictions.
# NOTE(review): predicting on the same rows the forest was fitted on tends to
# look optimistic; predict(modelo) without newdata should return the
# out-of-bag predictions -- consider reporting those as well.
mcre <- confusionMatrix(
  data = resultado_entrenamiento,
  reference = entrenamiento[["Churn"]]
)
mcre
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 152667      0
##          1      0 200000
##                                    
##                Accuracy : 1        
##                  95% CI : (1, 1)   
##     No Information Rate : 0.5671   
##     P-Value [Acc > NIR] : < 2.2e-16
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.4329   
##          Detection Rate : 0.4329   
##    Detection Prevalence : 0.4329   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : 0        
## 
# Confusion matrix for the test-set predictions.
mcrp <- confusionMatrix(
  data = resultado_prueba,
  reference = prueba[["Churn"]]
)
mcrp
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 38165     7
##          1     1 49992
##                                      
##                Accuracy : 0.9999     
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.5671     
##     P-Value [Acc > NIR] : <2e-16     
##                                      
##                   Kappa : 0.9998     
##                                      
##  Mcnemar's Test P-Value : 0.0771     
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 0.9999     
##          Pos Pred Value : 0.9998     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4329     
##          Detection Rate : 0.4329     
##    Detection Prevalence : 0.4330     
##       Balanced Accuracy : 0.9999     
##                                      
##        'Positive' Class : 0          
## 

Resultados

# Error rates of the forest as a function of the number of trees grown.
plot(x = modelo)

# Variable-importance dot charts for the fitted forest.
varImpPlot(x = modelo)

Conclusiones

1. El modelo identifica a Subscription.Type como la variable más influyente en la predicción, seguida por Gender y Contract.Length, lo que indica que las características relacionadas con el tipo de cliente y las condiciones del contrato son determinantes clave.

2. Variables como Age, Payment.Delay, Support.Calls y Total.Spend presentan una importancia intermedia, contribuyendo de manera significativa pero no predominante al desempeño del modelo.

3. Tenure, Last.Interaction, Usage.Frequency y Subscription.Type muestran una baja relevancia predictiva, por lo que su impacto en la precisión del modelo es limitado.

LS0tCnRpdGxlOiAiQm9zcXVlcyBBbGVhdG9yaW9zIC0gY3VzdG9tZXIgY2h1cm4iCmF1dGhvcjogIkp1bmlvciBNZWRpbmEiCmRhdGU6ICIyMDI2LTAyLTI1IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICAgdG9jOiBUUlVFCiAgICAgdG9jX2Zsb2F0OiBUUlVFCiAgICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgIHRoZW1lOiBmbGF0bHkKLS0tCgohW10oaHR0cHM6Ly9taXJvLm1lZGl1bS5jb20vMCpaNTJOYmtVM1VkZHEzZ1g3LnBuZykKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWUiPiBUZW9yw61hIDwvc3Bhbj4KClVuIGJvc3F1ZSBhbGVhdG9yaW8gZXMgdW4gY29uanVudG8gKGVuc2FtYmxlKSBkZSBtdWNob3Mgw6FyYm9sZXMgZGUgZGVjaXNpw7NuIGVudHJlbmFkb3Mgc29icmUgZGlmZXJlbnRlcyBzdWJjb25qdW50b3MgYWxlYXRvcmlvcyBkZSBsb3MgZGF0b3MgeSB2YXJpYWJsZXMuCgoKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+IEluc3RhbGFyIHBhcXVldGVzIHkgbGxhbWFyIGxpYnJlcmlhcyA8L3NwYW4+CgpgYGB7cn0KI2luc3RhbGwucGFja2FnZXMoImNhcmV0IikKbGlicmFyeShjYXJldCkKI2luc3RhbGwucGFja2FnZXMoInJhbmRvbUZvcmVzdCIpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpgYGAKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+IENhcmdhciBsYSBiYXNlIGRlIGRhdG9zIDwvc3Bhbj4KYGBge3J9CiNmaWxlLmNob29zZSgpCgpjaHVybiA8LSByZWFkLmNzdigiL1VzZXJzL2VkdWFyZG9qdW5pb3JtZWRpbmFoZXJuYW5kZXovRG93bmxvYWRzL2N1c3RvbWVyX2NodXJuLmNzdiIpCgoKYGBgCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlIj4gRW50ZW5kZXIgbGEgYmFzZSBkZSBkYXRvcyA8L3NwYW4+CmBgYHtyfQpzdW1tYXJ5KGNodXJuKQpzdHIoY2h1cm4pCmhlYWQoY2h1cm4pCmNodXJuJEdlbmRlciA8LWFzLmZhY3RvcihjaHVybiRHZW5kZXIpCmNodXJuJENodXJuIDwtYXMuZmFjdG9yKGNodXJuJENodXJuKQpjaHVybiRTdWJzY3JpcHRpb24uVHlwZSA8LWFzLmZhY3RvcihjaHVybiRTdWJzY3JpcHRpb24uVHlwZSkKY2h1cm4kQ29udHJhY3QuTGVuZ3RoIDwtYXMuZmFjdG9yKGNodXJuJENvbnRyYWN0Lkxlbmd0aCkKCmNodXJuIDwtIG5hLm9taXQoY2h1cm4pCmBgYAojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlIj5QYXJ0aXIgZGF0b3MgODAtMjA8L3NwYW4+CgpgYGB7cn0Kc2V0LnNlZWQoMTIzKQpyZWdsb25lc19lbnRyZW5hbWllbnRvIDwtIGNyZWF0ZURhdGFQYXJ0aXRpb24oY2h1cm4kQ2h1cm4sIHA9MC44LCBsaXN0PUZBTFNFKQplbnRyZW5hbWllbnRvIDwtIGNodXJuW3JlZ2xvbmVzX2VudHJlbmFtaWVudG8sIF0KcHJ1ZWJhIDwtIGNodXJuWy1yZWdsb25lc19lbnRyZW5hbWllbnRvLCBdCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+TW9kZWxvcyBkZSBCb3NxdWVzIEFsZWF0b3Jpb3MgPC9zcGFuPgoKYGBge3J9Cm1vZGVsbyA8LSByYW5kb21Gb3Jlc3QoQ2h1cm5+
LiAtQ3VzdG9tZXJJRCwKICAgICAgICAgICAgICAgIGRhdGE9IGVudHJlbmFtaWVudG8sCiAgICAgICAgICAgICAgICBudHJlZT0gMTAwLAogICAgICAgICAgICAgICAgaW1wb3J0YW5jZT1UUlVFKQoKCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+TWF0cmljZXMgZGUgY29uZnVzacOzbjwvc3Bhbj4KCmBgYHtyfQpyZXN1bHRhZG9fZW50cmVuYW1pZW50byA8LSBwcmVkaWN0KG1vZGVsbyxlbnRyZW5hbWllbnRvKQpyZXN1bHRhZG9fcHJ1ZWJhIDwtIHByZWRpY3QobW9kZWxvLCBwcnVlYmEpCgojTWF0cml6IGRlIGNvbmZ1c2nDs24gZGVsIHJlc3VsdGFkbyBkZSBlbnRyZW5hbWllbnRvIAptY3JlIDwtIGNvbmZ1c2lvbk1hdHJpeChyZXN1bHRhZG9fZW50cmVuYW1pZW50bywgZW50cmVuYW1pZW50byRDaHVybikKbWNyZQojTWF0cml6IGRlIGNvbmZ1c2lvbiAKbWNycCA8LWNvbmZ1c2lvbk1hdHJpeChyZXN1bHRhZG9fcHJ1ZWJhLHBydWViYSRDaHVybikKbWNycApgYGAKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+UmVzdWx0YWRvczwvc3Bhbj4KCmBgYHtyfQpwbG90KG1vZGVsbykKdmFySW1wUGxvdChtb2RlbG8pCmBgYAoKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWUiPkNvbmNsdXNpb25lczwvc3Bhbj4KCjEuRWwgbW9kZWxvIGlkZW50aWZpY2EgYSBTdXNjcmlwdGlvbiBjb21vIGxhIHZhcmlhYmxlIG3DoXMgaW5mbHV5ZW50ZSBlbiBsYSBwcmVkaWNjacOzbiwgc2VndWlkYSBwb3IgR2VuZGVyIHkgQ29udHJhY3QuTGVuZ3RoLCBsbyBxdWUgaW5kaWNhIHF1ZSBsYXMgY2FyYWN0ZXLDrXN0aWNhcyByZWxhY2lvbmFkYXMgY29uIGVsIHRpcG8gZGUgY2xpZW50ZSB5IGxhcyBjb25kaWNpb25lcyBkZWwgY29udHJhdG8gc29uIGRldGVybWluYW50ZXMgY2xhdmUuCgoyLlZhcmlhYmxlcyBjb21vIEFnZSwgUGF5bWVudC5EZWxheSwgU3VwcG9ydC5DYWxscyB5IFRvdGFsLlNwZW5kIHByZXNlbnRhbiB1bmEgaW1wb3J0YW5jaWEgaW50ZXJtZWRpYSwgY29udHJpYnV5ZW5kbyBkZSBtYW5lcmEgc2lnbmlmaWNhdGl2YSBwZXJvIG5vIHByZWRvbWluYW50ZSBhbCBkZXNlbXBlw7FvIGRlbCBtb2RlbG8uCgozLlRlbnVyZSwgTGFzdC5JbnRlcmFjdGlvbiwgVXNhZ2UuRnJlcXVlbmN5IHkgU3Vic2NyaXB0aW9uLlR5cGUgbXVlc3RyYW4gdW5hIGJhamEgcmVsZXZhbmNpYSBwcmVkaWN0aXZhLCBwb3IgbG8gcXVlIHN1IGltcGFjdG8gZW4gbGEgcHJlY2lzacOzbiBkZWwgbW9kZWxvIGVzIGxpbWl0YWRvLgoKCgo=