Teoria

Random Forest es un método de aprendizaje supervisado que se basa en un conjunto de múltiples árboles de decisión. Pero en lugar de trabajar de manera independiente, colaboran unos con otros para que las predicciones que realizan sean más precisas. Se le conoce también como bosque aleatorio. # Instalar paquetes y llamar librerías

# install.packages("caret")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(DataExplorer)

Importar la base de datos

churn <- read.csv("/Users/erickcaballero/Downloads/customer_churn.csv")

Entender la base de datos

summary(churn)
##    CustomerID          Age           Gender              Tenure     
##  Min.   :     2   Min.   :18.00   Length:440833      Min.   : 1.00  
##  1st Qu.:113622   1st Qu.:29.00   Class :character   1st Qu.:16.00  
##  Median :226126   Median :39.00   Mode  :character   Median :32.00  
##  Mean   :225399   Mean   :39.37                      Mean   :31.26  
##  3rd Qu.:337739   3rd Qu.:48.00                      3rd Qu.:46.00  
##  Max.   :449999   Max.   :65.00                      Max.   :60.00  
##  NA's   :1        NA's   :1                          NA's   :1      
##  Usage.Frequency Support.Calls    Payment.Delay   Subscription.Type 
##  Min.   : 1.00   Min.   : 0.000   Min.   : 0.00   Length:440833     
##  1st Qu.: 9.00   1st Qu.: 1.000   1st Qu.: 6.00   Class :character  
##  Median :16.00   Median : 3.000   Median :12.00   Mode  :character  
##  Mean   :15.81   Mean   : 3.604   Mean   :12.97                     
##  3rd Qu.:23.00   3rd Qu.: 6.000   3rd Qu.:19.00                     
##  Max.   :30.00   Max.   :10.000   Max.   :30.00                     
##  NA's   :1       NA's   :1        NA's   :1                         
##  Contract.Length     Total.Spend     Last.Interaction     Churn       
##  Length:440833      Min.   : 100.0   Min.   : 1.00    Min.   :0.0000  
##  Class :character   1st Qu.: 480.0   1st Qu.: 7.00    1st Qu.:0.0000  
##  Mode  :character   Median : 661.0   Median :14.00    Median :1.0000  
##                     Mean   : 631.6   Mean   :14.48    Mean   :0.5671  
##                     3rd Qu.: 830.0   3rd Qu.:22.00    3rd Qu.:1.0000  
##                     Max.   :1000.0   Max.   :30.00    Max.   :1.0000  
##                     NA's   :1        NA's   :1        NA's   :1
str(churn)
## 'data.frame':    440833 obs. of  12 variables:
##  $ CustomerID       : int  2 3 4 5 6 8 9 10 11 12 ...
##  $ Age              : int  30 65 55 58 23 51 58 55 39 64 ...
##  $ Gender           : chr  "Female" "Female" "Female" "Male" ...
##  $ Tenure           : int  39 49 14 38 32 33 49 37 12 3 ...
##  $ Usage.Frequency  : int  14 1 4 21 20 25 12 8 5 25 ...
##  $ Support.Calls    : int  5 10 6 7 5 9 3 4 7 2 ...
##  $ Payment.Delay    : int  18 8 18 7 8 26 16 15 4 11 ...
##  $ Subscription.Type: chr  "Standard" "Basic" "Basic" "Standard" ...
##  $ Contract.Length  : chr  "Annual" "Monthly" "Quarterly" "Monthly" ...
##  $ Total.Spend      : num  932 557 185 396 617 129 821 445 969 415 ...
##  $ Last.Interaction : int  17 6 3 29 20 8 24 30 13 29 ...
##  $ Churn            : int  1 1 1 1 1 1 1 1 1 1 ...
head(churn)
##   CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1          2  30 Female     39              14             5            18
## 2          3  65 Female     49               1            10             8
## 3          4  55 Female     14               4             6            18
## 4          5  58   Male     38              21             7             7
## 5          6  23   Male     32              20             5             8
## 6          8  51   Male     33              25             9            26
##   Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1          Standard          Annual         932               17     1
## 2             Basic         Monthly         557                6     1
## 3             Basic       Quarterly         185                3     1
## 4          Standard         Monthly         396               29     1
## 5             Basic         Monthly         617               20     1
## 6           Premium          Annual         129                8     1
churn$Churn <- as.factor(churn$Churn)
churn$Gender <- as.factor(churn$Gender)
churn$Subscription.Type <- as.factor(churn$Subscription.Type)
churn$Contract.Length <- as.factor(churn$Contract.Length)

churn <- na.omit(churn)

Partir datos 80/20

set.seed(123)
renglones_entrenamiento <- createDataPartition(churn$Churn, p=0.8, list=FALSE)
entrenamiento <- churn[renglones_entrenamiento, ]
prueba <- churn[-renglones_entrenamiento, ]

Modelo de Random Forests

modelo <- randomForest(Churn~ . -CustomerID,
                       data = entrenamiento,
                       ntree = 100,
                       importance=TRUE)

Matriz de confusión

resultado_entrenamiento <- predict(modelo,entrenamiento)
resultado_prueba <- predict(modelo, prueba)

# Matriz de confusión (resultado entrenamiento)
mcre <- confusionMatrix(resultado_entrenamiento, entrenamiento$Churn )
mcre
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      0      1
##          0 152667      0
##          1      0 200000
##                                    
##                Accuracy : 1        
##                  95% CI : (1, 1)   
##     No Information Rate : 0.5671   
##     P-Value [Acc > NIR] : < 2.2e-16
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.4329   
##          Detection Rate : 0.4329   
##    Detection Prevalence : 0.4329   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : 0        
## 
# Matriz de Confusión del Resultado de la prueba 
mcrp <- confusionMatrix(resultado_prueba, prueba$Churn)
mcrp
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 38165     7
##          1     1 49992
##                                      
##                Accuracy : 0.9999     
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.5671     
##     P-Value [Acc > NIR] : <2e-16     
##                                      
##                   Kappa : 0.9998     
##                                      
##  Mcnemar's Test P-Value : 0.0771     
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 0.9999     
##          Pos Pred Value : 0.9998     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4329     
##          Detection Rate : 0.4329     
##    Detection Prevalence : 0.4330     
##       Balanced Accuracy : 0.9999     
##                                      
##        'Positive' Class : 0          
## 

Resultados

plot(modelo)

varImpPlot(modelo)

LS0tCnRpdGxlOiAiUmFuZG9tIEZvcmVzdCAtIEN1c3RvbWVyIENodXJuIgphdXRob3I6ICJFcmljayBDYWJhbGxlcm8gTMOzcGV6IEEwMDgzODA2MSIKZGF0ZTogIjIwMjYtMDItMjUiCm91dHB1dDogCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICB0aGVtZTogZmxhdGx5Ci0tLQoKCiFbXShodHRwczovL21lZGlhLmxpY2RuLmNvbS9kbXMvaW1hZ2UvdjIvRDREMTJBUUVxOGZ3RThvQkxpUS9hcnRpY2xlLWNvdmVyX2ltYWdlLXNocmlua182MDBfMjAwMC9hcnRpY2xlLWNvdmVyX2ltYWdlLXNocmlua182MDBfMjAwMC8wLzE3MDU1MjY2NDczNDY/ZT0yMTQ3NDgzNjQ3JnY9YmV0YSZ0PW95U3QybHotZ1pFZGUyV0pRWU9Fdk1oeXpEMHR1bWw5RS1Cbm9iSlRZV0kpCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpncmVlbiI+IFRlb3JpYSA8L3NwYW4+ClJhbmRvbSBGb3Jlc3QgZXMgdW4gbcOpdG9kbyBkZSBhcHJlbmRpemFqZSBzdXBlcnZpc2FkbyBxdWUgc2UgYmFzYSBlbiB1biBjb25qdW50byBkZSBtw7psdGlwbGVzIMOhcmJvbGVzIGRlIGRlY2lzacOzbi4gUGVybyBlbiBsdWdhciBkZSB0cmFiYWphciBkZSBtYW5lcmEgaW5kZXBlbmRpZW50ZSwgY29sYWJvcmFuIHVub3MgY29uIG90cm9zIHBhcmEgcXVlIGxhcyBwcmVkaWNjaW9uZXMgcXVlIHJlYWxpemFuIHNlYW4gbcOhcyBwcmVjaXNhcy4gU2UgbGUgY29ub2NlIHRhbWJpw6luIGNvbW8gYm9zcXVlIGFsZWF0b3Jpby4KIyA8c3BhbiBzdHlsZT0iY29sb3I6Z3JlZW4iPiBJbnN0YWxhciBwYXF1ZXRlcyB5IGxsYW1hciBsaWJyZXLDrWFzIDwvc3Bhbj4KYGBge3J9CiMgaW5zdGFsbC5wYWNrYWdlcygiY2FyZXQiKQpsaWJyYXJ5KGNhcmV0KQojIGluc3RhbGwucGFja2FnZXMoInJhbmRvbUZvcmVzdCIpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpsaWJyYXJ5KERhdGFFeHBsb3JlcikKYGBgCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpncmVlbiI+IEltcG9ydGFyIGxhIGJhc2UgZGUgZGF0b3MgIDwvc3Bhbj4KYGBge3J9CmNodXJuIDwtIHJlYWQuY3N2KCIvVXNlcnMvZXJpY2tjYWJhbGxlcm8vRG93bmxvYWRzL2N1c3RvbWVyX2NodXJuLmNzdiIpCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6Z3JlZW4iPiBFbnRlbmRlciBsYSBiYXNlIGRlIGRhdG9zIDwvc3Bhbj4KYGBge3J9CnN1bW1hcnkoY2h1cm4pCnN0cihjaHVybikKaGVhZChjaHVybikKCmNodXJuJENodXJuIDwtIGFzLmZhY3RvcihjaHVybiRDaHVybikKY2h1cm4kR2VuZGVyIDwtIGFzLmZhY3RvcihjaHVybiRHZW5kZXIpCmNodXJuJFN1YnNjcmlwdGlvbi5UeXBlIDwtIGFzLmZhY3RvcihjaHVybiRTdWJzY3JpcHRpb24uVHlwZSkKY2h1cm4kQ29udHJhY3QuTGVuZ3RoIDwtIGFzLmZhY3RvcihjaHVybiRDb250cmFjdC5MZW5ndGgpCgpjaHVybiA8LSBuYS5vbWl0KGNodXJuKQoKYGBgCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmdyZWVuIj4gUGFydGlyIGRhdG9zIDgwLzIwIDwvc3Bhbj4KYGBge3J9CnNldC5zZWVkKDEyMykKcmVuZ2xvbmVzX2VudHJlbmFtaWVudG8gPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihjaHVybiRDaHVybiwgcD0wLjgsIGxpc3Q9RkFMU0UpCmVudHJlbmFtaWVudG8gPC0gY2h1cm5bcmVuZ2xvbmVzX2VudHJlbmFtaWVudG8sIF0KcHJ1ZWJhIDwtIGNodXJuWy1yZW5nbG9uZXNfZW50cmVuYW1pZW50bywgXQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmdyZWVuIj4gTW9kZWxvIGRlIFJhbmRvbSBGb3Jlc3RzIDwvc3Bhbj4KYGBge3J9Cm1vZGVsbyA8LSByYW5kb21Gb3Jlc3QoQ2h1cm5+IC4gLUN1c3RvbWVySUQsCiAgICAgICAgICAgICAgICAgICAgICAgZGF0YSA9IGVudHJlbmFtaWVudG8sCiAgICAgICAgICAgICAgICAgICAgICAgbnRyZWUgPSAxMDAsCiAgICAgICAgICAgICAgICAgICAgICAgaW1wb3J0YW5jZT1UUlVFKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmdyZWVuIj4gTWF0cml6IGRlIGNvbmZ1c2nDs24gPC9zcGFuPgpgYGB7cn0KcmVzdWx0YWRvX2VudHJlbmFtaWVudG8gPC0gcHJlZGljdChtb2RlbG8sZW50cmVuYW1pZW50bykKcmVzdWx0YWRvX3BydWViYSA8LSBwcmVkaWN0KG1vZGVsbywgcHJ1ZWJhKQoKIyBNYXRyaXogZGUgY29uZnVzacOzbiAocmVzdWx0YWRvIGVudHJlbmFtaWVudG8pCm1jcmUgPC0gY29uZnVzaW9uTWF0cml4KHJlc3VsdGFkb19lbnRyZW5hbWllbnRvLCBlbnRyZW5hbWllbnRvJENodXJuICkKbWNyZQojIE1hdHJpeiBkZSBDb25mdXNpw7NuIGRlbCBSZXN1bHRhZG8gZGUgbGEgcHJ1ZWJhIAptY3JwIDwtIGNvbmZ1c2lvbk1hdHJpeChyZXN1bHRhZG9fcHJ1ZWJhLCBwcnVlYmEkQ2h1cm4pCm1jcnAKYGBgCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmdyZWVuIj4gUmVzdWx0YWRvcyA8L3NwYW4+CmBgYHtyfQpwbG90KG1vZGVsbykKdmFySW1wUGxvdChtb2RlbG8pCmBgYAoK