Instalar paquetes y llamar librerias
#install.packages("caret")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
#install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
Entender la base de datos
#file.choose()
churn <- read.csv("/Users/giuliana/Downloads/customer_churn.csv")
head(churn)
## CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1 2 30 Female 39 14 5 18
## 2 3 65 Female 49 1 10 8
## 3 4 55 Female 14 4 6 18
## 4 5 58 Male 38 21 7 7
## 5 6 23 Male 32 20 5 8
## 6 8 51 Male 33 25 9 26
## Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1 Standard Annual 932 17 1
## 2 Basic Monthly 557 6 1
## 3 Basic Quarterly 185 3 1
## 4 Standard Monthly 396 29 1
## 5 Basic Monthly 617 20 1
## 6 Premium Annual 129 8 1
summary(churn)
## CustomerID Age Gender Tenure
## Min. : 2 Min. :18.00 Length:440833 Min. : 1.00
## 1st Qu.:113622 1st Qu.:29.00 Class :character 1st Qu.:16.00
## Median :226126 Median :39.00 Mode :character Median :32.00
## Mean :225399 Mean :39.37 Mean :31.26
## 3rd Qu.:337739 3rd Qu.:48.00 3rd Qu.:46.00
## Max. :449999 Max. :65.00 Max. :60.00
## NA's :1 NA's :1 NA's :1
## Usage.Frequency Support.Calls Payment.Delay Subscription.Type
## Min. : 1.00 Min. : 0.000 Min. : 0.00 Length:440833
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 6.00 Class :character
## Median :16.00 Median : 3.000 Median :12.00 Mode :character
## Mean :15.81 Mean : 3.604 Mean :12.97
## 3rd Qu.:23.00 3rd Qu.: 6.000 3rd Qu.:19.00
## Max. :30.00 Max. :10.000 Max. :30.00
## NA's :1 NA's :1 NA's :1
## Contract.Length Total.Spend Last.Interaction Churn
## Length:440833 Min. : 100.0 Min. : 1.00 Min. :0.0000
## Class :character 1st Qu.: 480.0 1st Qu.: 7.00 1st Qu.:0.0000
## Mode :character Median : 661.0 Median :14.00 Median :1.0000
## Mean : 631.6 Mean :14.48 Mean :0.5671
## 3rd Qu.: 830.0 3rd Qu.:22.00 3rd Qu.:1.0000
## Max. :1000.0 Max. :30.00 Max. :1.0000
## NA's :1 NA's :1 NA's :1
str(churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : chr "Female" "Female" "Female" "Male" ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: chr "Standard" "Basic" "Basic" "Standard" ...
## $ Contract.Length : chr "Annual" "Monthly" "Quarterly" "Monthly" ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : int 1 1 1 1 1 1 1 1 1 1 ...
head(churn)
## CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1 2 30 Female 39 14 5 18
## 2 3 65 Female 49 1 10 8
## 3 4 55 Female 14 4 6 18
## 4 5 58 Male 38 21 7 7
## 5 6 23 Male 32 20 5 8
## 6 8 51 Male 33 25 9 26
## Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1 Standard Annual 932 17 1
## 2 Basic Monthly 557 6 1
## 3 Basic Quarterly 185 3 1
## 4 Standard Monthly 396 29 1
## 5 Basic Monthly 617 20 1
## 6 Premium Annual 129 8 1
churn$Churn <- as.factor(churn$Churn)
str(churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : chr "Female" "Female" "Female" "Male" ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: chr "Standard" "Basic" "Basic" "Standard" ...
## $ Contract.Length : chr "Annual" "Monthly" "Quarterly" "Monthly" ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
summary(churn)
## CustomerID Age Gender Tenure
## Min. : 2 Min. :18.00 Length:440833 Min. : 1.00
## 1st Qu.:113622 1st Qu.:29.00 Class :character 1st Qu.:16.00
## Median :226126 Median :39.00 Mode :character Median :32.00
## Mean :225399 Mean :39.37 Mean :31.26
## 3rd Qu.:337739 3rd Qu.:48.00 3rd Qu.:46.00
## Max. :449999 Max. :65.00 Max. :60.00
## NA's :1 NA's :1 NA's :1
## Usage.Frequency Support.Calls Payment.Delay Subscription.Type
## Min. : 1.00 Min. : 0.000 Min. : 0.00 Length:440833
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 6.00 Class :character
## Median :16.00 Median : 3.000 Median :12.00 Mode :character
## Mean :15.81 Mean : 3.604 Mean :12.97
## 3rd Qu.:23.00 3rd Qu.: 6.000 3rd Qu.:19.00
## Max. :30.00 Max. :10.000 Max. :30.00
## NA's :1 NA's :1 NA's :1
## Contract.Length Total.Spend Last.Interaction Churn
## Length:440833 Min. : 100.0 Min. : 1.00 0 :190833
## Class :character 1st Qu.: 480.0 1st Qu.: 7.00 1 :249999
## Mode :character Median : 661.0 Median :14.00 NA's: 1
## Mean : 631.6 Mean :14.48
## 3rd Qu.: 830.0 3rd Qu.:22.00
## Max. :1000.0 Max. :30.00
## NA's :1 NA's :1
str(churn)
## 'data.frame': 440833 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : chr "Female" "Female" "Female" "Male" ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: chr "Standard" "Basic" "Basic" "Standard" ...
## $ Contract.Length : chr "Annual" "Monthly" "Quarterly" "Monthly" ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
head(churn)
## CustomerID Age Gender Tenure Usage.Frequency Support.Calls Payment.Delay
## 1 2 30 Female 39 14 5 18
## 2 3 65 Female 49 1 10 8
## 3 4 55 Female 14 4 6 18
## 4 5 58 Male 38 21 7 7
## 5 6 23 Male 32 20 5 8
## 6 8 51 Male 33 25 9 26
## Subscription.Type Contract.Length Total.Spend Last.Interaction Churn
## 1 Standard Annual 932 17 1
## 2 Basic Monthly 557 6 1
## 3 Basic Quarterly 185 3 1
## 4 Standard Monthly 396 29 1
## 5 Basic Monthly 617 20 1
## 6 Premium Annual 129 8 1
#erlimina NA
churn <- na.omit(churn)
Entender la base de datos
churn$Churn <- as.factor(churn$Churn)
churn$Contract.Length <- as.factor(churn$Contract.Length)
churn$Subscription.Type <- as.factor(churn$Subscription.Type)
churn$Gender <- as.factor(churn$Gender)
str(churn)
## 'data.frame': 440832 obs. of 12 variables:
## $ CustomerID : int 2 3 4 5 6 8 9 10 11 12 ...
## $ Age : int 30 65 55 58 23 51 58 55 39 64 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 1 1 2 1 ...
## $ Tenure : int 39 49 14 38 32 33 49 37 12 3 ...
## $ Usage.Frequency : int 14 1 4 21 20 25 12 8 5 25 ...
## $ Support.Calls : int 5 10 6 7 5 9 3 4 7 2 ...
## $ Payment.Delay : int 18 8 18 7 8 26 16 15 4 11 ...
## $ Subscription.Type: Factor w/ 3 levels "Basic","Premium",..: 3 1 1 3 1 2 3 2 3 3 ...
## $ Contract.Length : Factor w/ 3 levels "Annual","Monthly",..: 1 2 3 2 2 1 3 1 3 3 ...
## $ Total.Spend : num 932 557 185 396 617 129 821 445 969 415 ...
## $ Last.Interaction : int 17 6 3 29 20 8 24 30 13 29 ...
## $ Churn : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, "na.action")= 'omit' Named int 199296
## ..- attr(*, "names")= chr "199296"
Partir datos 80/20
set.seed(123)
renglones_entrenamiento <- createDataPartition(churn$Churn,p=0.8, list=FALSE)
entrenamiento <- churn[renglones_entrenamiento, ]
prueba <- churn[-renglones_entrenamiento, ]
Modelo de Bosques Aleatorios
modelo <- randomForest(Churn ~ . -CustomerID, data = entrenamiento, ntree = 100, importance = TRUE)
Matrices de Confusion
resultado_entrenamiento <- predict(modelo, entrenamiento)
resultado_prueba <- predict(modelo, prueba)
#Matriz de confusion del resultado del entrenamiento
mcre<- confusionMatrix(resultado_entrenamiento, entrenamiento$Churn)
mcre
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 152667 0
## 1 0 200000
##
## Accuracy : 1
## 95% CI : (1, 1)
## No Information Rate : 0.5671
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4329
## Detection Rate : 0.4329
## Detection Prevalence : 0.4329
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
#matriz de confusuon del resultado de prueba
mcrp <- confusionMatrix(resultado_prueba, prueba$Churn)
mcrp
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38165 7
## 1 1 49992
##
## Accuracy : 0.9999
## 95% CI : (0.9998, 1)
## No Information Rate : 0.5671
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9998
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 1.0000
## Specificity : 0.9999
## Pos Pred Value : 0.9998
## Neg Pred Value : 1.0000
## Prevalence : 0.4329
## Detection Rate : 0.4329
## Detection Prevalence : 0.4330
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
Resultados
plot(modelo)

varImpPlot(modelo)

LS0tCnRpdGxlOiAiUmFuZG9tIEZvcmVzdCIKYXV0aG9yOiAiR2l1bGlhbmEgTWFuY2VyYSBGbG9yZXMgLSBBMDA4NDA0MTYiCmRhdGU6ICIyNS4wMi4yMDI2IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6IGNvc21vCi0tLQo8Y2VudGVyPgohW10oaHR0cHM6Ly9yb2JvdGljc2Jpei5jb20vd3AtY29udGVudC91cGxvYWRzLzIwMjIvMDkvcmFuZG9tLWZvcmVzdC5qcGcpCjxjZW50ZXI+CgoKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+IEluc3RhbGFyIHBhcXVldGVzIHkgbGxhbWFyIGxpYnJlcmlhcyA8L3NwYW4+CgpgYGB7ciBtZXNzYWdlcz1GQUxTRSwgd2FybmluZz1GQUxTRX0KI2luc3RhbGwucGFja2FnZXMoImNhcmV0IikKbGlicmFyeShjYXJldCkKI2luc3RhbGwucGFja2FnZXMoInJhbmRvbUZvcmVzdCIpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+IEVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPgpgYGB7cn0KI2ZpbGUuY2hvb3NlKCkKY2h1cm4gPC0gcmVhZC5jc3YoIi9Vc2Vycy9naXVsaWFuYS9Eb3dubG9hZHMvY3VzdG9tZXJfY2h1cm4uY3N2IikKaGVhZChjaHVybikKYGBgCgoKYGBge3J9CnN1bW1hcnkoY2h1cm4pCnN0cihjaHVybikKaGVhZChjaHVybikKCmNodXJuJENodXJuIDwtIGFzLmZhY3RvcihjaHVybiRDaHVybikKc3RyKGNodXJuKQpgYGAKYGBge3J9CnN1bW1hcnkoY2h1cm4pCnN0cihjaHVybikKaGVhZChjaHVybikKCiNlcmxpbWluYSBOQQpjaHVybiA8LSBuYS5vbWl0KGNodXJuKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+IEVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPgpgYGB7cn0KY2h1cm4kQ2h1cm4gPC0gYXMuZmFjdG9yKGNodXJuJENodXJuKQpjaHVybiRDb250cmFjdC5MZW5ndGggPC0gYXMuZmFjdG9yKGNodXJuJENvbnRyYWN0Lkxlbmd0aCkKY2h1cm4kU3Vic2NyaXB0aW9uLlR5cGUgPC0gYXMuZmFjdG9yKGNodXJuJFN1YnNjcmlwdGlvbi5UeXBlKQpjaHVybiRHZW5kZXIgPC0gYXMuZmFjdG9yKGNodXJuJEdlbmRlcikgCgpzdHIoY2h1cm4pCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7Ij4gUGFydGlyIGRhdG9zIDgwLzIwIDwvc3Bhbj4KYGBge3J9CnNldC5zZWVkKDEyMykKcmVuZ2xvbmVzX2VudHJlbmFtaWVudG8gPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihjaHVybiRDaHVybixwPTAuOCwgbGlzdD1GQUxTRSkKZW50cmVuYW1pZW50byA8LSBjaHVybltyZW5nbG9uZXNfZW50cmVuYW1pZW50bywgXQpwcnVlYmEgPC0gY2h1cm5bLXJlbmdsb25lc19lbnRyZW5hbWllbnRvLCBdCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7Ij4gTW9kZWxvIGRlIEJvc3F1ZXMgQWxlYXRvcmlvcyA8L3NwYW4+CmBgYHtyfQptb2RlbG8gPC0gcmFuZG9tRm9yZXN0KENodXJuIH4gLiAtQ3VzdG9tZXJJRCwgZGF0YSA9IGVudHJlbmFtaWVudG8sIG50cmVlID0gMTAwLCBpbXBvcnRhbmNlID0gVFJVRSkKYGBgCgojIDxzcGFuIHN0eWxlPSJjb2xvcjogYmx1ZTsiPiBNYXRyaWNlcyBkZSBDb25mdXNpb24gPC9zcGFuPgpgYGB7cn0KcmVzdWx0YWRvX2VudHJlbmFtaWVudG8gPC0gcHJlZGljdChtb2RlbG8sIGVudHJlbmFtaWVudG8pCnJlc3VsdGFkb19wcnVlYmEgPC0gcHJlZGljdChtb2RlbG8sIHBydWViYSkKCiNNYXRyaXogZGUgY29uZnVzaW9uIGRlbCByZXN1bHRhZG8gZGVsIGVudHJlbmFtaWVudG8gCm1jcmU8LSBjb25mdXNpb25NYXRyaXgocmVzdWx0YWRvX2VudHJlbmFtaWVudG8sIGVudHJlbmFtaWVudG8kQ2h1cm4pCm1jcmUKCmBgYAoKCmBgYHtyfQojbWF0cml6IGRlIGNvbmZ1c3VvbiBkZWwgcmVzdWx0YWRvIGRlIHBydWViYQptY3JwIDwtIGNvbmZ1c2lvbk1hdHJpeChyZXN1bHRhZG9fcHJ1ZWJhLCBwcnVlYmEkQ2h1cm4pCm1jcnAKYGBgCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+IFJlc3VsdGFkb3MgPC9zcGFuPgpgYGB7cn0KcGxvdChtb2RlbG8pCmBgYApgYGB7cn0KdmFySW1wUGxvdChtb2RlbG8pCmBgYAoK