El Random Forest es un algoritmo de machine learning, que opera en varios modelos, en lugar de confiar en un único modelo, genera un conjunto de árboles de decisión independientes.
Para evitar el sobreajuste y garantizar diversidad, cada árbol se construye utilizando solo una parte aleatoria de los datos y de las variables disponibles.
library(caret)
library(randomForest)
churn <- read.csv("C:\\Users\\Emili\\Downloads\\customer_churn.csv")
summary(churn)
## CustomerID Age Gender Tenure
## Min. : 2 Min. :18.00 Length:440833 Min. : 1.00
## 1st Qu.:113622 1st Qu.:29.00 Class :character 1st Qu.:16.00
## Median :226126 Median :39.00 Mode :character Median :32.00
## Mean :225399 Mean :39.37 Mean :31.26
## 3rd Qu.:337739 3rd Qu.:48.00 3rd Qu.:46.00
## Max. :449999 Max. :65.00 Max. :60.00
## NA's :1 NA's :1 NA's :1
## Usage.Frequency Support.Calls Payment.Delay Subscription.Type
## Min. : 1.00 Min. : 0.000 Min. : 0.00 Length:440833
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 6.00 Class :character
## Median :16.00 Median : 3.000 Median :12.00 Mode :character
## Mean :15.81 Mean : 3.604 Mean :12.97
## 3rd Qu.:23.00 3rd Qu.: 6.000 3rd Qu.:19.00
## Max. :30.00 Max. :10.000 Max. :30.00
## NA's :1 NA's :1 NA's :1
## Contract.Length Total.Spend Last.Interaction Churn
## Length:440833 Min. : 100.0 Min. : 1.00 Min. :0.0000
## Class :character 1st Qu.: 480.0 1st Qu.: 7.00 1st Qu.:0.0000
## Mode :character Median : 661.0 Median :14.00 Median :1.0000
## Mean : 631.6 Mean :14.48 Mean :0.5671
## 3rd Qu.: 830.0 3rd Qu.:22.00 3rd Qu.:1.0000
## Max. :1000.0 Max. :30.00 Max. :1.0000
## NA's :1 NA's :1 NA's :1
str(churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : chr "Female" "Female" "Female" "Male" ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: chr "Standard" "Basic" "Basic" "Standard" ...
## $ Contract.Length : chr "Annual" "Monthly" "Quarterly" "Monthly" ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : int 1 1 1 1 1 1 1 1 1 1 ...
head(churn)
## CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1 2 30 Female 39 14 5 18
## 2 3 65 Female 49 1 10 8
## 3 4 55 Female 14 4 6 18
## 4 5 58 Male 38 21 7 7
## 5 6 23 Male 32 20 5 8
## 6 8 51 Male 33 25 9 26
## Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1 Standard Annual 932 17 1
## 2 Basic Monthly 557 6 1
## 3 Basic Quarterly 185 3 1
## 4 Standard Monthly 396 29 1
## 5 Basic Monthly 617 20 1
## 6 Premium Annual 129 8 1
churn$Gender <- as.factor(churn$Gender)
churn$Subscription.Type <- as.factor(churn$Subscription.Type)
churn$Contract.Length <- as.factor(churn$Contract.Length)
churn$Churn <- as.factor(churn$Churn)
str(churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : Factor w/ 3 levels "","Female","Male": 2 2 2 3 3 3 2 2 3 2 ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: Factor w/ 4 levels "","Basic","Premium",..: 4 2 2 4 2 3 4 3 4 4 ...
## $ Contract.Length : Factor w/ 4 levels "","Annual","Monthly",..: 2 3 4 3 3 2 4 2 4 4 ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
churn<- na.omit(churn)
set.seed(123)
renglones_entrenamiento <- createDataPartition(churn$Churn, p=0.8, list=FALSE)
entrenamiento <- churn[renglones_entrenamiento, ]
prueba <- churn[-renglones_entrenamiento, ]
modelo <- randomForest(Churn~ . -CustomerID,
data = entrenamiento,
ntree= 100,
importance= TRUE)
resultado_entrenamiento <- predict(modelo,entrenamiento)
resultado_prueba <- predict(modelo,prueba)
#Matriz de Confusion del Resultado del Entrenamiento
mcre <- confusionMatrix(resultado_entrenamiento,entrenamiento$Churn)
mcre
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 152667 0
## 1 0 200000
##
## Accuracy : 1
## 95% CI : (1, 1)
## No Information Rate : 0.5671
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4329
## Detection Rate : 0.4329
## Detection Prevalence : 0.4329
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
#Matriz de Confusion de la Prueba
mcrp <- confusionMatrix(resultado_prueba, prueba$Churn)
mcrp
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38165 7
## 1 1 49992
##
## Accuracy : 0.9999
## 95% CI : (0.9998, 1)
## No Information Rate : 0.5671
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9998
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 1.0000
## Specificity : 0.9999
## Pos Pred Value : 0.9998
## Neg Pred Value : 1.0000
## Prevalence : 0.4329
## Detection Rate : 0.4329
## Detection Prevalence : 0.4330
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
plot(modelo)
varImpPlot(modelo)