Los Bosques Aleatorios son una técnica de aprendizaje automático que se basa en construir varios árboles de decisión en lugar de uno solo. Cada uno de estos árboles se entrena con una muestra aleatoria de los datos y con diferentes subconjuntos de variables, lo que ayuda a disminuir el sobreajuste. Al final, el modelo combina las predicciones de todos los árboles, ya sea por votación en clasificación o por promedio en regresión, logrando resultados más precisos y estables que los de un solo árbol.
library(caret)
library(randomForest)
# Load the customer churn data set from disk.
# NOTE(review): the path is absolute and specific to one machine; anyone else
# running this script has to point it at their own copy of the CSV.
churn <- read.csv(
  file = "/Users/nataliamartinez/Desktop/customer_churn.csv",
  header = TRUE
)

# Per-column summary: quartiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(object = churn)
## CustomerID Age Gender Tenure
## Min. : 2 Min. :18.00 Length:440833 Min. : 1.00
## 1st Qu.:113622 1st Qu.:29.00 Class :character 1st Qu.:16.00
## Median :226126 Median :39.00 Mode :character Median :32.00
## Mean :225399 Mean :39.37 Mean :31.26
## 3rd Qu.:337739 3rd Qu.:48.00 3rd Qu.:46.00
## Max. :449999 Max. :65.00 Max. :60.00
## NA's :1 NA's :1 NA's :1
## Usage.Frequency Support.Calls Payment.Delay Subscription.Type
## Min. : 1.00 Min. : 0.000 Min. : 0.00 Length:440833
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 6.00 Class :character
## Median :16.00 Median : 3.000 Median :12.00 Mode :character
## Mean :15.81 Mean : 3.604 Mean :12.97
## 3rd Qu.:23.00 3rd Qu.: 6.000 3rd Qu.:19.00
## Max. :30.00 Max. :10.000 Max. :30.00
## NA's :1 NA's :1 NA's :1
## Contract.Length Total.Spend Last.Interaction Churn
## Length:440833 Min. : 100.0 Min. : 1.00 Min. :0.0000
## Class :character 1st Qu.: 480.0 1st Qu.: 7.00 1st Qu.:0.0000
## Mode :character Median : 661.0 Median :14.00 Median :1.0000
## Mean : 631.6 Mean :14.48 Mean :0.5671
## 3rd Qu.: 830.0 3rd Qu.:22.00 3rd Qu.:1.0000
## Max. :1000.0 Max. :30.00 Max. :1.0000
## NA's :1 NA's :1 NA's :1
# Compact overview of the raw data: column types and the first few values.
str(object = churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : chr "Female" "Female" "Female" "Male" ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: chr "Standard" "Basic" "Basic" "Standard" ...
## $ Contract.Length : chr "Annual" "Monthly" "Quarterly" "Monthly" ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : int 1 1 1 1 1 1 1 1 1 1 ...
# Show the first six rows to eyeball the raw records.
head(x = churn, n = 6L)
## CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1 2 30 Female 39 14 5 18
## 2 3 65 Female 49 1 10 8
## 3 4 55 Female 14 4 6 18
## 4 5 58 Male 38 21 7 7
## 5 6 23 Male 32 20 5 8
## 6 8 51 Male 33 25 9 26
## Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1 Standard Annual 932 17 1
## 2 Basic Monthly 557 6 1
## 3 Basic Quarterly 185 3 1
## 4 Standard Monthly 396 29 1
## 5 Basic Monthly 617 20 1
## 6 Premium Annual 129 8 1
# Convert the categorical predictors and the response to factors.
# Churn must be a factor so that the random forest fitted later performs
# classification rather than regression on the 0/1 codes.
columnas_categoricas <- c(
  "Gender",
  "Subscription.Type",
  "Contract.Length",
  "Churn"
)
churn[columnas_categoricas] <- lapply(churn[columnas_categoricas], as.factor)

# Re-inspect the structure to confirm the new column types and their levels.
str(churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : Factor w/ 3 levels "","Female","Male": 2 2 2 3 3 3 2 2 3 2 ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: Factor w/ 4 levels "","Basic","Premium",..: 4 2 2 4 2 3 4 3 4 4 ...
## $ Contract.Length : Factor w/ 4 levels "","Annual","Monthly",..: 2 3 4 3 3 2 4 2 4 4 ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Discard rows that contain at least one NA, then drop factor levels that are
# left without any observation. na.omit() alone keeps such levels (for example
# an empty-string "" level that only occurred in a removed row), and they would
# otherwise be carried into the model as phantom categories.
churn <- droplevels(na.omit(churn))

# Fix the RNG seed so the partition (and everything that follows) is
# reproducible.
set.seed(123)

# Row indices for an 80 % training sample; because Churn is a factor, caret
# samples within each class so both subsets keep the class proportions.
renglones_entrenamiento <- createDataPartition(churn$Churn, p = 0.8, list = FALSE)

# Training set = selected rows; test set = the remaining 20 %.
entrenamiento <- churn[renglones_entrenamiento, ]
prueba <- churn[-renglones_entrenamiento, ]
# Fit a classification random forest for Churn on the training set.
# The formula uses every column as a predictor except CustomerID, which is
# removed explicitly. importance = TRUE requests that predictor importance be
# assessed so it can be plotted afterwards.
modelo <- randomForest(
  Churn ~ . - CustomerID,
  data = entrenamiento,
  importance = TRUE,
  ntree = 100
)
# Predicted classes for the rows the model was trained on. These are
# resubstitution predictions (the same rows are passed back as newdata), so
# they are expected to look better than performance on unseen data.
resultado_entrenamiento <- predict(object = modelo, newdata = entrenamiento)

# Predicted classes for the held-out test rows.
resultado_prueba <- predict(object = modelo, newdata = prueba)
# Confusion matrix for the training-set predictions.
# No `positive` argument is given, so caret reports the class-specific
# statistics (sensitivity, specificity, ...) with "0" as the positive class.
mcre <- confusionMatrix(
  data = resultado_entrenamiento,
  reference = entrenamiento$Churn
)
print(mcre)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 152667 0
## 1 0 200000
##
## Accuracy : 1
## 95% CI : (1, 1)
## No Information Rate : 0.5671
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4329
## Detection Rate : 0.4329
## Detection Prevalence : 0.4329
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
# Confusion matrix for the held-out test-set predictions.
# No `positive` argument is given, so caret reports the class-specific
# statistics (sensitivity, specificity, ...) with "0" as the positive class.
mcrp <- confusionMatrix(
  data = resultado_prueba,
  reference = prueba$Churn
)
print(mcrp)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38165 7
## 1 1 49992
##
## Accuracy : 0.9999
## 95% CI : (0.9998, 1)
## No Information Rate : 0.5671
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9998
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 1.0000
## Specificity : 0.9999
## Pos Pred Value : 0.9998
## Neg Pred Value : 1.0000
## Prevalence : 0.4329
## Detection Rate : 0.4329
## Detection Prevalence : 0.4330
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
# Error rates of the forest as a function of the number of trees grown.
plot(x = modelo)

# Dot chart of the variable-importance measures stored in the fitted model.
varImpPlot(x = modelo)