Teoría

Los Bosques Aleatorios son una técnica de aprendizaje automático que se basa en construir varios árboles de decisión en lugar de uno solo. Cada uno de estos árboles se entrena con una muestra aleatoria de los datos y con diferentes subconjuntos de variables, lo que ayuda a disminuir el sobreajuste. Al final, el modelo combina las predicciones de todos los árboles, ya sea por votación en clasificación o por promedio en regresión, logrando resultados más precisos y estables que los de un solo árbol.

Instalar paquetes y llamar librerías

library(caret) 
library(randomForest) 

Cargar Base de Datos y Entenderla

# Location of the customer churn CSV. Defaults to the original path; set the
# CHURN_CSV environment variable to run the analysis on another machine
# without editing the code.
ruta_datos <- Sys.getenv(
  "CHURN_CSV",
  unset = "/Users/nataliamartinez/Desktop/customer_churn.csv"
)
if (!file.exists(ruta_datos)) {
  stop("Data file not found: ", ruta_datos, call. = FALSE)
}

# Blank fields in text columns are read as "" rather than NA, because
# read.csv() only treats the string "NA" as missing by default.
churn <- read.csv(ruta_datos)

# Per-column overview, including NA counts for the numeric columns.
summary(churn)
##    CustomerID          Age           Gender              Tenure     
##  Min.   :     2   Min.   :18.00   Length:440833      Min.   : 1.00  
##  1st Qu.:113622   1st Qu.:29.00   Class :character   1st Qu.:16.00  
##  Median :226126   Median :39.00   Mode  :character   Median :32.00  
##  Mean   :225399   Mean   :39.37                      Mean   :31.26  
##  3rd Qu.:337739   3rd Qu.:48.00                      3rd Qu.:46.00  
##  Max.   :449999   Max.   :65.00                      Max.   :60.00  
##  NA's   :1        NA's   :1                          NA's   :1      
##  Usage.Frequency Support.Calls    Payment.Delay   Subscription.Type 
##  Min.   : 1.00   Min.   : 0.000   Min.   : 0.00   Length:440833     
##  1st Qu.: 9.00   1st Qu.: 1.000   1st Qu.: 6.00   Class :character  
##  Median :16.00   Median : 3.000   Median :12.00   Mode  :character  
##  Mean   :15.81   Mean   : 3.604   Mean   :12.97                     
##  3rd Qu.:23.00   3rd Qu.: 6.000   3rd Qu.:19.00                     
##  Max.   :30.00   Max.   :10.000   Max.   :30.00                     
##  NA's   :1       NA's   :1        NA's   :1                         
##  Contract.Length     Total.Spend     Last.Interaction     Churn       
##  Length:440833      Min.   : 100.0   Min.   : 1.00    Min.   :0.0000  
##  Class :character   1st Qu.: 480.0   1st Qu.: 7.00    1st Qu.:0.0000  
##  Mode  :character   Median : 661.0   Median :14.00    Median :1.0000  
##                     Mean   : 631.6   Mean   :14.48    Mean   :0.5671  
##                     3rd Qu.: 830.0   3rd Qu.:22.00    3rd Qu.:1.0000  
##                     Max.   :1000.0   Max.   :30.00    Max.   :1.0000  
##                     NA's   :1        NA's   :1        NA's   :1
# Compact view of each column's type and leading values.
utils::str(churn)
## 'data.frame':    440833 obs. of  12 variables:
##  $ CustomerID       : int  2 3 4 5 6 8 9 10 11 12 ...
##  $ Age              : int  30 65 55 58 23 51 58 55 39 64 ...
##  $ Gender           : chr  "Female" "Female" "Female" "Male" ...
##  $ Tenure           : int  39 49 14 38 32 33 49 37 12 3 ...
##  $ Usage.Frequency  : int  14 1 4 21 20 25 12 8 5 25 ...
##  $ Support.Calls    : int  5 10 6 7 5 9 3 4 7 2 ...
##  $ Payment.Delay    : int  18 8 18 7 8 26 16 15 4 11 ...
##  $ Subscription.Type: chr  "Standard" "Basic" "Basic" "Standard" ...
##  $ Contract.Length  : chr  "Annual" "Monthly" "Quarterly" "Monthly" ...
##  $ Total.Spend      : num  932 557 185 396 617 129 821 445 969 415 ...
##  $ Last.Interaction : int  17 6 3 29 20 8 24 30 13 29 ...
##  $ Churn            : int  1 1 1 1 1 1 1 1 1 1 ...
# Preview the first six records.
head(churn, n = 6L)
##   CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1          2  30 Female     39              14             5            18
## 2          3  65 Female     49               1            10             8
## 3          4  55 Female     14               4             6            18
## 4          5  58   Male     38              21             7             7
## 5          6  23   Male     32              20             5             8
## 6          8  51   Male     33              25             9            26
##   Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1          Standard          Annual         932               17     1
## 2             Basic         Monthly         557                6     1
## 3             Basic       Quarterly         185                3     1
## 4          Standard         Monthly         396               29     1
## 5             Basic         Monthly         617               20     1
## 6           Premium          Annual         129                8     1
# Store the categorical columns as factors. This includes the Churn target,
# which must be a factor for the forest to be fitted as a classifier.
columnas_categoricas <- c(
  "Gender", "Subscription.Type", "Contract.Length", "Churn"
)
churn[columnas_categoricas] <- lapply(churn[columnas_categoricas], as.factor)

# Confirm the new column types.
str(churn)
## 'data.frame':    440833 obs. of  12 variables:
##  $ CustomerID       : int  2 3 4 5 6 8 9 10 11 12 ...
##  $ Age              : int  30 65 55 58 23 51 58 55 39 64 ...
##  $ Gender           : Factor w/ 3 levels "","Female","Male": 2 2 2 3 3 3 2 2 3 2 ...
##  $ Tenure           : int  39 49 14 38 32 33 49 37 12 3 ...
##  $ Usage.Frequency  : int  14 1 4 21 20 25 12 8 5 25 ...
##  $ Support.Calls    : int  5 10 6 7 5 9 3 4 7 2 ...
##  $ Payment.Delay    : int  18 8 18 7 8 26 16 15 4 11 ...
##  $ Subscription.Type: Factor w/ 4 levels "","Basic","Premium",..: 4 2 2 4 2 3 4 3 4 4 ...
##  $ Contract.Length  : Factor w/ 4 levels "","Annual","Monthly",..: 2 3 4 3 3 2 4 2 4 4 ...
##  $ Total.Spend      : num  932 557 185 396 617 129 821 445 969 415 ...
##  $ Last.Interaction : int  17 6 3 29 20 8 24 30 13 29 ...
##  $ Churn            : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Remove rows with missing values, then drop factor levels that no longer
# occur in the data. The factors are created before this step, so a level
# such as "" would otherwise persist even if only removed rows carried it.
churn <- droplevels(na.omit(churn))

Partir datos 80/20

# Fixed seed so the split is reproducible.
set.seed(123)

# Row indices for an 80% training sample, stratified on the Churn outcome.
renglones_entrenamiento <- createDataPartition(
  y = churn[["Churn"]],
  p = 0.8,
  list = FALSE
)

# The held-out 20% is the test set; the sampled rows are the training set.
prueba <- churn[-renglones_entrenamiento, ]

entrenamiento <- churn[renglones_entrenamiento, ]

Modelo de Bosques Aleatorios

# Fit a 100-tree forest for Churn on the training rows, using every column
# except the CustomerID identifier as a predictor. importance = TRUE keeps
# the per-variable importance measures.
modelo <- randomForest(
  Churn ~ . - CustomerID,
  data = entrenamiento,
  importance = TRUE,
  ntree = 100
)

Matrices de Confusión

# Class predictions from the fitted forest for both partitions.
# NOTE(review): scoring the same rows the forest was trained on gives
# resubstitution results, which are optimistic. Per the randomForest docs,
# predict(modelo) without newdata returns out-of-bag predictions instead.
resultado_prueba <- predict(modelo, newdata = prueba)

resultado_entrenamiento <- predict(modelo, newdata = entrenamiento)

# Confusion matrix for the training-set predictions.
# positive = "1" makes churners (Churn == 1) the event of interest. By default
# caret uses the first factor level ("0"), so sensitivity, specificity and the
# predictive values would describe non-churners instead.

mcre <- confusionMatrix(resultado_entrenamiento, entrenamiento$Churn,
                        positive = "1")

mcre
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 152667      0
##          1      0 200000
##                                    
##                Accuracy : 1        
##                  95% CI : (1, 1)   
##     No Information Rate : 0.5671   
##     P-Value [Acc > NIR] : < 2.2e-16
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.5671   
##          Detection Rate : 0.5671   
##    Detection Prevalence : 0.5671   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : 1        
## 

# Confusion matrix for the held-out test-set predictions.
# positive = "1" reports sensitivity/specificity for churners (Churn == 1)
# rather than for caret's default positive class, the first level ("0").

mcrp <- confusionMatrix(resultado_prueba, prueba$Churn, positive = "1")
mcrp
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 38165     7
##          1     1 49992
##                                      
##                Accuracy : 0.9999     
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.5671     
##     P-Value [Acc > NIR] : <2e-16     
##                                      
##                   Kappa : 0.9998     
##                                      
##  Mcnemar's Test P-Value : 0.0771     
##                                      
##             Sensitivity : 0.9999     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 0.9998     
##              Prevalence : 0.5671     
##          Detection Rate : 0.5670     
##    Detection Prevalence : 0.5670     
##       Balanced Accuracy : 0.9999     
##                                      
##        'Positive' Class : 1          
## 

Graficar Modelo

# Error rates of the forest as a function of the number of trees.
plot(modelo, type = "l")

# Variable-importance dot charts, ordered from most to least important.
varImpPlot(modelo, sort = TRUE)