The libraries are loaded.
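The original does not list the library() calls. A plausible set, given the functions used throughout (glimpse, map_dbl, ggarrange, createDataPartition, gather, ...), would be:
library(dplyr)   # glimpse(), select(), mutate()
library(purrr)   # map_dbl(), map_if()
library(ggplot2) # plots
library(ggpubr)  # ggarrange(), annotate_figure(), text_grob()
library(tibble)  # rownames_to_column()
library(tidyr)   # gather(), spread(), separate()
library(stringr) # string helpers (partly masked later when recipes is attached)
library(caret)   # createDataPartition(), train(), resamples()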
Step 0: Data loading
datos_modelo <- read.csv("C:/Users/Osvaldo Mora Mora/Documents/Modelo Churn/Modelo Churn publicación Rpubs/Telco-Customer-Churn.csv", header = T, sep = ";")
Step 1: Exploratory analysis of the variables
as.data.frame(sort(names(datos_modelo)))
## sort(names(datos_modelo))
## 1 Churn
## 2 Contract
## 3 customerID
## 4 Dependents
## 5 DeviceProtection
## 6 gender
## 7 InternetService
## 8 MonthlyCharges
## 9 MultipleLines
## 10 OnlineBackup
## 11 OnlineSecurity
## 12 PaperlessBilling
## 13 Partner
## 14 PaymentMethod
## 15 PhoneService
## 16 SeniorCitizen
## 17 StreamingMovies
## 18 StreamingTV
## 19 TechSupport
## 20 tenure
## 21 TotalCharges
str(datos_modelo)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
glimpse(datos_modelo)
## Rows: 7,043
## Columns: 21
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW…
## $ gender <chr> "Female", "Male", "Male", "Male", "Female", "Female",…
## $ SeniorCitizen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes…
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"…
## $ tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2…
## $ PhoneService <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ MultipleLines <chr> "No phone service", "No", "No", "No phone service", "…
## $ InternetService <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber opt…
## $ OnlineSecurity <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "…
## $ OnlineBackup <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "N…
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Y…
## $ TechSupport <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes…
## $ StreamingTV <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Ye…
## $ StreamingMovies <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes…
## $ Contract <chr> "Month-to-month", "One year", "Month-to-month", "One …
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ PaymentMethod <chr> "Electronic check", "Mailed check", "Mailed check", "…
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7…
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949…
## $ Churn <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "Y…
# Number of observations in the dataset
nrow(datos_modelo)
## [1] 7043
# Check whether any rows are incomplete
any(!complete.cases(datos_modelo))
## [1] TRUE
# Number of missing values per variable
map_dbl(datos_modelo, .f = function(x){sum(is.na(x))})
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
## Since the objective is to build a model, the rows with NAs are simply dropped; note that there are dedicated methods (e.g., imputation) for treating NAs.
The format of some variables is now adjusted to make them easier to handle and manipulate when the model is developed.
datos_modelo <- na.omit(datos_modelo)
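# After dropping the 11 incomplete rows, 7,032 observations (7,043 - 11) should remain:
nrow(datos_modelo)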
datos_modelo$MultipleLines <- gsub(" ", "_", x= datos_modelo$MultipleLines)
datos_modelo$PaymentMethod <- gsub(" ", "_", x= datos_modelo$PaymentMethod)
datos_modelo$OnlineSecurity <- gsub(" ", "_", x= datos_modelo$OnlineSecurity)
datos_modelo$OnlineBackup <- gsub(" ", "_", x= datos_modelo$OnlineBackup)
datos_modelo$DeviceProtection <- gsub(" ", "_", x= datos_modelo$DeviceProtection)
datos_modelo$TechSupport <- gsub(" ", "_", x= datos_modelo$TechSupport)
datos_modelo$StreamingTV <- gsub(" ", "_", x= datos_modelo$StreamingTV)
datos_modelo$StreamingMovies <- gsub(" ", "_", x= datos_modelo$StreamingMovies)
datos_modelo$Contract <- gsub(" ", "_", x= datos_modelo$Contract)
datos_modelo$Contract <- gsub("-", "_", x= datos_modelo$Contract)
datos_modelo$PaperlessBilling <- gsub(" ", "_", x= datos_modelo$PaperlessBilling)
## The character variables are converted to factors
datos_modelo$gender <- as.factor(datos_modelo$gender)
datos_modelo$Partner <- as.factor(datos_modelo$Partner)
datos_modelo$PhoneService <- as.factor(datos_modelo$PhoneService)
datos_modelo$MultipleLines <- as.factor(datos_modelo$MultipleLines)
datos_modelo$InternetService <- as.factor(datos_modelo$InternetService)
datos_modelo$OnlineSecurity <- as.factor(datos_modelo$OnlineSecurity)
datos_modelo$OnlineBackup <- as.factor(datos_modelo$OnlineBackup)
datos_modelo$DeviceProtection <- as.factor(datos_modelo$DeviceProtection)
datos_modelo$TechSupport <- as.factor(datos_modelo$TechSupport)
datos_modelo$StreamingTV <- as.factor(datos_modelo$StreamingTV)
datos_modelo$StreamingMovies <- as.factor(datos_modelo$StreamingMovies)
datos_modelo$Contract <- as.factor(datos_modelo$Contract)
datos_modelo$PaperlessBilling <- as.factor(datos_modelo$PaperlessBilling)
datos_modelo$PaymentMethod <- as.factor(datos_modelo$PaymentMethod)
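For reference, the column-by-column cleanup above can be written more compactly with dplyr's across(). This is an equivalent sketch (it assumes dplyr >= 1.0 and, unlike the step-by-step version, also converts the remaining character columns such as Dependents and Churn to factors):
cols_a_limpiar <- c("MultipleLines", "PaymentMethod", "OnlineSecurity",
                    "OnlineBackup", "DeviceProtection", "TechSupport",
                    "StreamingTV", "StreamingMovies", "Contract",
                    "PaperlessBilling")
datos_modelo <- datos_modelo %>%
  # replace spaces and hyphens with underscores in the selected columns
  mutate(across(all_of(cols_a_limpiar), ~ gsub("[ -]", "_", .x))) %>%
  # convert every remaining character column except customerID to factor
  mutate(across(where(is.character) & !any_of("customerID"), as.factor))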
# Descriptive analysis of the variables: distribution of the response (target) variable
## Distribution of the variable to be predicted ##
## Assessment of the class-imbalance problem ##
ggplot(data = datos_modelo,
mapping = aes(x =Churn, fill = Churn)) +
geom_bar() + scale_fill_manual(values = c("cadetblue2","brown1"),
name = "Estado Actual") +
labs(x = "", y = "Cantidad de individuos",
title = "Distribución de la variable Objetivo") +
theme_minimal() +
theme(legend.position = "bottom")
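The imbalance can also be quantified directly; the class proportions (roughly 73% "No" vs. 27% "Yes", consistent with the split proportions shown later) are obtained with:
prop.table(table(datos_modelo$Churn))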
## Analysis of numeric variables
datoscont <- datos_modelo %>% select(Churn, SeniorCitizen, tenure, MonthlyCharges, TotalCharges)
# Function that, for column i of datoscont, draws a density plot and a boxplot
# of the variable split by Churn, and prints summary statistics for each class.
Analisiscont <- function(i) {
  nombres <- colnames(datoscont)
  p1 <- ggplot(data = datos_modelo, aes(x = datoscont[, i], fill = Churn)) +
    geom_density(alpha = 0.5) +
    labs(title = paste("Density of variable", nombres[i]), x = nombres[i], y = "Density") +
    scale_fill_manual(values = c("gray50", "#1874CD")) +
    geom_rug(aes(color = Churn), alpha = 0.5) +
    scale_color_manual(values = c("gray50", "#1874CD")) +
    theme_bw()
  p2 <- ggplot(data = datos_modelo, aes(x = Churn, y = datoscont[, i], color = Churn)) +
    geom_boxplot(outlier.shape = NA) +
    labs(title = paste("Boxplot of variable", nombres[i]), x = nombres[i], y = "Boxplot") +
    geom_jitter(alpha = 0.3, width = 0.15) +
    scale_color_manual(values = c("gray50", "#1874CD")) +
    theme_bw()
  final_plot <- ggarrange(p1, p2, legend = "top")
  final_plot <- annotate_figure(final_plot, top = text_grob("Density and boxplot charts", size = 15))
  # Summary statistics (mean, median, min, max) for each class of Churn
  x <- na.omit(subset(datoscont, Churn == "Yes"))
  SI <- x %>% summarise(media = mean(x[, i]),
                        mediana = median(x[, i]),
                        min = min(x[, i]),
                        max = max(x[, i]))
  y <- na.omit(subset(datoscont, Churn == "No"))
  NO <- y %>% summarise(media = mean(y[, i]),
                        mediana = median(y[, i]),
                        min = min(y[, i]),
                        max = max(y[, i]))
  final <- as.data.frame(rbind(SI, NO))
  rownames(final) <- c("Yes", "No")
  print(final)
  boxplot(datoscont[, i], main = paste("Boxplot of variable", nombres[i]))
  plot(final_plot)
}
# Loop over the numeric columns (column 1 is Churn)
for (i in 2:ncol(datoscont)) {
  Analisiscont(i)
}
The four summary tables below correspond, in order, to SeniorCitizen, tenure, MonthlyCharges, and TotalCharges.
## media mediana min max
## Yes 0.2546816 0 0 1
## No 0.1289948 0 0 1
## media mediana min max
## Yes 17.97913 10 1 72
## No 37.65001 38 1 72
## media mediana min max
## Yes 74.44133 79.65 18.85 118.35
## No 61.30741 64.45 18.25 118.75
## media mediana min max
## Yes 1531.796 703.55 18.85 8684.80
## No 2555.344 1683.60 18.80 8672.45
# Analysis of categorical (qualitative) variables
datoscat <- select(datos_modelo, -customerID, -SeniorCitizen, -tenure, -MonthlyCharges, -TotalCharges)
predictora <- select(datos_modelo, Churn)
datoscat <- select(datoscat, -Churn)
datoscat <- cbind(predictora, datoscat)
# Function that, for column i of datoscat, draws a bar chart split by Churn and
# prints the absolute counts and row-wise relative frequencies against Churn.
Analisiscat <- function(i) {
  nombres <- colnames(datoscat)
  g <- ggplot(data = datoscat, aes(x = datoscat[, i], fill = Churn)) +
    geom_bar() +
    labs(title = paste("Bar chart of variable", nombres[i]), x = nombres[i], y = "Count") +
    scale_fill_manual(values = c("gray50", "#1874CD")) +
    theme_bw() +
    theme(legend.position = "bottom")
  plot(g)
  tab_abs <- table(datoscat[, i], datos_modelo$Churn)  # absolute counts
  tab_rel <- prop.table(tab_abs, margin = 1) %>% round(digits = 2)
  print(tab_abs)
  print(tab_rel)
}
# Loop over the categorical variables (column 1 is Churn)
for (i in 2:ncol(datoscat)) {
  Analisiscat(i)
}
The tables below appear in column order: gender, Partner, Dependents, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, and PaymentMethod.
##
## No Yes
## Female 2544 939
## Male 2619 930
##
## No Yes
## Female 0.73 0.27
## Male 0.74 0.26
##
## No Yes
## No 2439 1200
## Yes 2724 669
##
## No Yes
## No 0.67 0.33
## Yes 0.80 0.20
##
## No Yes
## No 3390 1543
## Yes 1773 326
##
## No Yes
## No 0.69 0.31
## Yes 0.84 0.16
##
## No Yes
## No 510 170
## Yes 4653 1699
##
## No Yes
## No 0.75 0.25
## Yes 0.73 0.27
##
## No Yes
## No 2536 849
## No_phone_service 510 170
## Yes 2117 850
##
## No Yes
## No 0.75 0.25
## No_phone_service 0.75 0.25
## Yes 0.71 0.29
##
## No Yes
## DSL 1957 459
## Fiber optic 1799 1297
## No 1407 113
##
## No Yes
## DSL 0.81 0.19
## Fiber optic 0.58 0.42
## No 0.93 0.07
##
## No Yes
## No 2036 1461
## No_internet_service 1407 113
## Yes 1720 295
##
## No Yes
## No 0.58 0.42
## No_internet_service 0.93 0.07
## Yes 0.85 0.15
##
## No Yes
## No 1854 1233
## No_internet_service 1407 113
## Yes 1902 523
##
## No Yes
## No 0.60 0.40
## No_internet_service 0.93 0.07
## Yes 0.78 0.22
##
## No Yes
## No 1883 1211
## No_internet_service 1407 113
## Yes 1873 545
##
## No Yes
## No 0.61 0.39
## No_internet_service 0.93 0.07
## Yes 0.77 0.23
##
## No Yes
## No 2026 1446
## No_internet_service 1407 113
## Yes 1730 310
##
## No Yes
## No 0.58 0.42
## No_internet_service 0.93 0.07
## Yes 0.85 0.15
##
## No Yes
## No 1867 942
## No_internet_service 1407 113
## Yes 1889 814
##
## No Yes
## No 0.66 0.34
## No_internet_service 0.93 0.07
## Yes 0.70 0.30
##
## No Yes
## No 1843 938
## No_internet_service 1407 113
## Yes 1913 818
##
## No Yes
## No 0.66 0.34
## No_internet_service 0.93 0.07
## Yes 0.70 0.30
##
## No Yes
## Month_to_month 2220 1655
## One_year 1306 166
## Two_year 1637 48
##
## No Yes
## Month_to_month 0.57 0.43
## One_year 0.89 0.11
## Two_year 0.97 0.03
##
## No Yes
## No 2395 469
## Yes 2768 1400
##
## No Yes
## No 0.84 0.16
## Yes 0.66 0.34
##
## No Yes
## Bank_transfer_(automatic) 1284 258
## Credit_card_(automatic) 1289 232
## Electronic_check 1294 1071
## Mailed_check 1296 308
##
## No Yes
## Bank_transfer_(automatic) 0.83 0.17
## Credit_card_(automatic) 0.85 0.15
## Electronic_check 0.55 0.45
## Mailed_check 0.81 0.19
## Variable importance analysis
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
datos_rf <- datos_modelo %>%
select(-customerID) %>%
na.omit()
datos_rf <- map_if(.x = datos_rf, .p = is.character, .f = as.factor) %>%
as.data.frame()
modelo_randforest <- randomForest(formula = Churn ~ . ,
data = na.omit(datos_rf),
mtry = 5,
importance = TRUE,
ntree = 1000)
importancia <- as.data.frame(modelo_randforest$importance)
importancia <- rownames_to_column(importancia,var = "variable")
p1 <- ggplot(data = importancia, aes(x = reorder(variable, MeanDecreaseAccuracy),
y = MeanDecreaseAccuracy,
fill = MeanDecreaseAccuracy)) +
labs(x = "variable", title = "Reducción de Accuracy") +
geom_col() +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom")
p2 <- ggplot(data = importancia, aes(x = reorder(variable, MeanDecreaseGini),
y = MeanDecreaseGini,
fill = MeanDecreaseGini)) +
labs(x = "variable", title = "Reducción de pureza (Gini)") +
geom_col() +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom")
ggarrange(p1, p2)
Conclusion: from the analysis above, the following variables are selected for modeling:
1. TotalCharges
2. MonthlyCharges
3. tenure
4. Contract
5. PaymentMethod
6. OnlineSecurity
7. TechSupport
8. InternetService
(Note that MonthlyCharges is not carried into the modeling dataset built below, so the final models use seven predictors, as the recipe output confirms.)
# Modeling
base_final_modelo <- select(datos_modelo,
Churn,
TotalCharges,
tenure,
Contract,
PaymentMethod,
OnlineSecurity,
TechSupport,
InternetService
)
set.seed(123)
# Create the indices of the training observations
train <- createDataPartition(y = base_final_modelo$Churn, p = 0.8, list = FALSE, times = 1)
datos_train <- base_final_modelo[train, ]
datos_test <- base_final_modelo[-train, ]
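A quick sanity check on the split sizes (the 5,627 training rows are confirmed later by the recipe output; the remaining 1,405 rows form the test set):
nrow(datos_train) # 5627
nrow(datos_test)  # 1405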
prop.table(table(datos_train$Churn))
##
## No Yes
## 0.734139 0.265861
prop.table(table(datos_test$Churn))
##
## No Yes
## 0.7345196 0.2654804
## Dummy-variable transformation
## The recipes package is used
library(recipes)
## Warning: package 'recipes' was built under R version 4.2.1
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
objeto_recipe <- recipe(formula = Churn ~.,
data = datos_train)
objeto_recipe
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 7
objeto_recipe <- objeto_recipe %>% step_dummy(all_nominal(), -all_outcomes())
trained_recipe <- prep(objeto_recipe, training = datos_train)
trained_recipe
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 7
##
## Training data contained 5627 data points and no missing data.
##
## Operations:
##
## Dummy variables from Contract, PaymentMethod, OnlineSecurity, TechSupport, Int... [trained]
# The transformations are applied to the training and test sets
datos_train_prep <- bake(trained_recipe, new_data = datos_train)
datos_test_prep <- bake(trained_recipe, new_data = datos_test)
glimpse(datos_train_prep)
## Rows: 5,627
## Columns: 14
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75,…
## $ tenure <int> 1, 34, 2, 45, 2, 8, 10, 13, 58, …
## $ Churn <fct> No, No, Yes, No, Yes, Yes, No, N…
## $ Contract_One_year <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ Contract_Two_year <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ PaymentMethod_Credit_card_.automatic. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,…
## $ PaymentMethod_Electronic_check <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,…
## $ PaymentMethod_Mailed_check <dbl> 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,…
## $ OnlineSecurity_No_internet_service <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ OnlineSecurity_Yes <dbl> 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,…
## $ TechSupport_No_internet_service <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ TechSupport_Yes <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,…
## $ InternetService_Fiber.optic <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,…
## $ InternetService_No <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
The modeling process now begins.
Model: KNN
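Several of the trainControl() calls below set allowParallel = TRUE, which only takes effect if a parallel backend has been registered beforehand. A minimal sketch, assuming the doParallel package is installed:
library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores() - 1)  # leave one core free
registerDoParallel(cl)
# ... fit the models ...
# stopCluster(cl)  # release the workers when done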
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(k = c(1, 2, 5, 10, 15, 20, 30, 50))
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
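# Note: caret expects one seed vector per resampling iteration, each with at
# least as many seeds as hyperparameter candidates, plus a single seed in the
# last list element for the final model fit; the same pattern is reused below.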
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_knn <- train(Churn ~ ., data = datos_train_prep,
method = "knn",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_knn
## k-Nearest Neighbors
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7094338 0.2627058
## 2 0.7091177 0.2605544
## 5 0.7520541 0.3116591
## 10 0.7547546 0.2973680
## 15 0.7541859 0.2728730
## 20 0.7534402 0.2586785
## 30 0.7516267 0.2153460
## 50 0.7487468 0.1632502
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_knn, highlight = TRUE) +
scale_x_continuous(breaks = hiperparametros$k) +
labs(title = "Evolución del accuracy del modelo KNN", x = "K") +
theme_bw()
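KNN is distance-based, so unscaled predictors such as TotalCharges can dominate the distance computation. A hedged variant of the earlier recipe that also centers and scales the numeric predictors (step_normalize is part of recipes; objeto_recipe_norm is a hypothetical name):
objeto_recipe_norm <- recipe(Churn ~ ., data = datos_train) %>%
  step_normalize(all_numeric_predictors()) %>%  # scale tenure and TotalCharges
  step_dummy(all_nominal(), -all_outcomes())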
Model: Naive Bayes
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(usekernel = FALSE, fL = 0 , adjust = 0)
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = F)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_nb <- train(Churn ~ ., data = datos_train_prep,
method = "nb",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_nb
## Naive Bayes
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7083022 0.3974846
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'usekernel' was held constant at a value of FALSE
## Tuning
## parameter 'adjust' was held constant at a value of 0
Model: Logistic Regression
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(parameter = "none")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_logistic <- train(Churn ~ ., data = datos_train_prep,
method = "glm",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train,
family = "binomial")
modelo_logistic
## Generalized Linear Model
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7944944 0.441462
summary(modelo_logistic$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6287 -0.6832 -0.2974 0.7882 3.3504
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) 0.00687108 0.12639968 0.054
## TotalCharges 0.00040599 0.00007141 5.685
## tenure -0.06360217 0.00672134 -9.463
## Contract_One_year -0.71615138 0.11729818 -6.105
## Contract_Two_year -1.51183448 0.19640933 -7.697
## PaymentMethod_Credit_card_.automatic. 0.01754077 0.12501386 0.140
## PaymentMethod_Electronic_check 0.36586878 0.10339514 3.539
## PaymentMethod_Mailed_check -0.19157921 0.12571149 -1.524
## OnlineSecurity_No_internet_service -1.09871254 0.14234678 -7.719
## OnlineSecurity_Yes -0.48222793 0.09307387 -5.181
## TechSupport_No_internet_service NA NA NA
## TechSupport_Yes -0.35997033 0.09407138 -3.827
## InternetService_Fiber.optic 0.67212528 0.09967481 6.743
## InternetService_No NA NA NA
## Pr(>|z|)
## (Intercept) 0.956648
## TotalCharges 0.0000000130869939 ***
## tenure < 0.0000000000000002 ***
## Contract_One_year 0.0000000010254845 ***
## Contract_Two_year 0.0000000000000139 ***
## PaymentMethod_Credit_card_.automatic. 0.888415
## PaymentMethod_Electronic_check 0.000402 ***
## PaymentMethod_Mailed_check 0.127519
## OnlineSecurity_No_internet_service 0.0000000000000118 ***
## OnlineSecurity_Yes 0.0000002205442449 ***
## TechSupport_No_internet_service NA
## TechSupport_Yes 0.000130 ***
## InternetService_Fiber.optic 0.0000000000154956 ***
## InternetService_No NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6517.2 on 5626 degrees of freedom
## Residual deviance: 4748.3 on 5615 degrees of freedom
## AIC: 4772.3
##
## Number of Fisher Scoring iterations: 6
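The two coefficients reported as NA arise because TechSupport_No_internet_service and InternetService_No carry exactly the same information as OnlineSecurity_No_internet_service (all three encode "no internet service"), so glm drops them as perfectly collinear. A hedged way to confirm this with caret's findLinearCombos:
# detect exact linear dependencies among the dummy predictors
caret::findLinearCombos(as.matrix(datos_train_prep %>% select(-Churn)))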
Model: LDA
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(parameter = "none")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_lda <- train(Churn ~ ., data = datos_train_prep,
method = "lda",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_lda
## Linear Discriminant Analysis
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.793572 0.4482773
modelo_lda$finalModel
## Call:
## lda(x, grouping = y)
##
## Prior probabilities of groups:
## No Yes
## 0.734139 0.265861
##
## Group means:
## TotalCharges tenure Contract_One_year Contract_Two_year
## No 2562.370 37.67901 0.25006052 0.31905108
## Yes 1536.372 17.97460 0.08957219 0.02540107
## PaymentMethod_Credit_card_.automatic. PaymentMethod_Electronic_check
## No 0.2447349 0.2500605
## Yes 0.1303476 0.5661765
## PaymentMethod_Mailed_check OnlineSecurity_No_internet_service
## No 0.2532075 0.27184701
## Yes 0.1624332 0.05949198
## OnlineSecurity_Yes TechSupport_No_internet_service TechSupport_Yes
## No 0.3326071 0.27184701 0.3292181
## Yes 0.1590909 0.05949198 0.1684492
## InternetService_Fiber.optic InternetService_No
## No 0.3524570 0.27184701
## Yes 0.6938503 0.05949198
##
## Coefficients of linear discriminants:
## LD1
## TotalCharges -0.0001259866
## tenure -0.0135844702
## Contract_One_year -0.5702482927
## Contract_Two_year -0.3970764027
## PaymentMethod_Credit_card_.automatic. 0.0186664183
## PaymentMethod_Electronic_check 0.4103846868
## PaymentMethod_Mailed_check -0.1556114816
## OnlineSecurity_No_internet_service -0.3123079295
## OnlineSecurity_Yes -0.3542029605
## TechSupport_No_internet_service -0.3123079295
## TechSupport_Yes -0.2445416912
## InternetService_Fiber.optic 0.9512969841
## InternetService_No -0.3123079295
Model: Simple classification tree (C5.0)
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(parameter = "none")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_C50Tree <- train(Churn~ ., data = datos_train_prep,
method = "C5.0Tree",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_C50Tree
## Single C5.0 Tree
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7918968 0.4493307
summary(modelo_C50Tree$finalModel)
##
## Call:
## C50:::C5.0.default(x = x, y = y, weights = wts)
##
##
## C5.0 [Release 2.07 GPL Edition] Thu Oct 27 14:18:33 2022
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 5627 cases (14 attributes) from undefined.data
##
## Decision tree:
##
## Contract_Two_year > 0: No (1356/38)
## Contract_Two_year <= 0:
## :...Contract_One_year > 0: No (1167/134)
## Contract_One_year <= 0:
## :...InternetService_Fiber.optic <= 0:
## :...tenure > 5: No (800/143)
## : tenure <= 5:
## : :...OnlineSecurity_No_internet_service > 0: No (222/59)
## : OnlineSecurity_No_internet_service <= 0:
## : :...TechSupport_Yes <= 0: Yes (302/136)
## : TechSupport_Yes > 0: No (71/27)
## InternetService_Fiber.optic > 0:
## :...tenure <= 15: Yes (833/260)
## tenure > 15:
## :...tenure > 54: No (143/32)
## tenure <= 54:
## :...PaymentMethod_Electronic_check <= 0: No (331/116)
## PaymentMethod_Electronic_check > 0:
## :...TechSupport_Yes <= 0: Yes (324/148)
## TechSupport_Yes > 0: No (78/32)
##
##
## Evaluation on training data (5627 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 11 1125(20.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3587 544 (a): class No
## 581 915 (b): class Yes
##
##
## Attribute usage:
##
## 100.00% Contract_Two_year
## 75.90% Contract_One_year
## 55.16% tenure
## 55.16% InternetService_Fiber.optic
## 13.77% TechSupport_Yes
## 13.03% PaymentMethod_Electronic_check
## 10.57% OnlineSecurity_No_internet_service
##
##
## Time: 0.0 secs
Model: Random Forest
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- expand.grid(mtry = c(3, 4, 5, 7),
min.node.size = c(2, 3, 4, 5, 10, 15, 20, 30),
splitrule = "gini")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_rf <- train(Churn ~ ., data = datos_train_prep,
method = "ranger",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train,
# Number of trees to fit
num.trees = 500)
modelo_rf
## Random Forest
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## mtry min.node.size Accuracy Kappa
## 3 2 0.7958807 0.4450180
## 3 3 0.7954187 0.4430749
## 3 4 0.7961637 0.4453089
## 3 5 0.7962350 0.4455455
## 3 10 0.7957022 0.4440247
## 3 15 0.7953476 0.4429570
## 3 20 0.7954888 0.4429158
## 3 30 0.7965906 0.4464387
## 4 2 0.7957727 0.4501614
## 4 3 0.7958792 0.4500688
## 4 4 0.7965551 0.4518373
## 4 5 0.7965901 0.4517354
## 4 10 0.7967332 0.4520130
## 4 15 0.7971947 0.4530559
## 4 20 0.7971590 0.4522873
## 4 30 0.7977997 0.4543360
## 5 2 0.7932485 0.4475451
## 5 3 0.7929997 0.4461445
## 5 4 0.7935686 0.4475773
## 5 5 0.7937462 0.4474147
## 5 10 0.7937817 0.4466109
## 5 15 0.7946703 0.4477403
## 5 20 0.7953818 0.4493334
## 5 30 0.7972309 0.4529959
## 7 2 0.7824795 0.4224128
## 7 3 0.7837946 0.4239638
## 7 4 0.7848242 0.4267782
## 7 5 0.7859975 0.4289385
## 7 10 0.7879509 0.4315076
## 7 15 0.7891610 0.4321659
## 7 20 0.7891954 0.4298566
## 7 30 0.7913283 0.4328507
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
## and min.node.size = 30.
modelo_rf$finalModel
## Ranger result
##
## Call:
## ranger::ranger(dependent.variable.name = ".outcome", data = x, mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size, splitrule = as.character(param$splitrule), write.forest = TRUE, probability = classProbs, ...)
##
## Type: Classification
## Number of trees: 500
## Sample size: 5627
## Number of independent variables: 13
## Mtry: 4
## Target node size: 30
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error: 20.12 %
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_rf, highlight = TRUE) +
scale_x_continuous(breaks = 1:30) +
labs(title = "Evolución del accuracy del modelo Random Forest") +
guides(color = guide_legend(title = "mtry"),
shape = guide_legend(title = "mtry")) +
theme_bw()
Model: Gradient Boosting
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- expand.grid(interaction.depth = c(1, 2),
n.trees = c(500, 1000, 2000),
shrinkage = c(0.001, 0.01, 0.1),
n.minobsinnode = c(2, 5, 15))
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_boost <- train(Churn ~ ., data = datos_train_prep,
method = "gbm",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train,
# Exponential (AdaBoost) loss for the binary outcome
distribution = "adaboost",
verbose = FALSE)
modelo_boost
## Stochastic Gradient Boosting
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 0.001 1 2 500 0.7341395 0.0000000
## 0.001 1 2 1000 0.7341395 0.0000000
## 0.001 1 2 2000 0.7623250 0.1871962
## 0.001 1 5 500 0.7341395 0.0000000
## 0.001 1 5 1000 0.7341395 0.0000000
## 0.001 1 5 2000 0.7621829 0.1864404
## 0.001 1 15 500 0.7341395 0.0000000
## 0.001 1 15 1000 0.7341395 0.0000000
## 0.001 1 15 2000 0.7622184 0.1867043
## 0.001 2 2 500 0.7341395 0.0000000
## 0.001 2 2 1000 0.7341395 0.0000000
## 0.001 2 2 2000 0.7847554 0.3378705
## 0.001 2 5 500 0.7341395 0.0000000
## 0.001 2 5 1000 0.7341395 0.0000000
## 0.001 2 5 2000 0.7846841 0.3375264
## 0.001 2 15 500 0.7341395 0.0000000
## 0.001 2 15 1000 0.7341395 0.0000000
## 0.001 2 15 2000 0.7848975 0.3384592
## 0.010 1 2 500 0.7888076 0.3656672
## 0.010 1 2 1000 0.7928607 0.4064255
## 0.010 1 2 2000 0.7946712 0.4259312
## 0.010 1 5 500 0.7887366 0.3653983
## 0.010 1 5 1000 0.7926116 0.4057564
## 0.010 1 5 2000 0.7945291 0.4255129
## 0.010 1 15 500 0.7887364 0.3655223
## 0.010 1 15 1000 0.7926822 0.4057301
## 0.010 1 15 2000 0.7945295 0.4254822
## 0.010 2 2 500 0.7949201 0.4228465
## 0.010 2 2 1000 0.7952762 0.4348609
## 0.010 2 2 2000 0.7955957 0.4409645
## 0.010 2 5 500 0.7947064 0.4219485
## 0.010 2 5 1000 0.7958452 0.4375691
## 0.010 2 5 2000 0.7958797 0.4423048
## 0.010 2 15 500 0.7950270 0.4224679
## 0.010 2 15 1000 0.7957384 0.4378673
## 0.010 2 15 2000 0.7958441 0.4425964
## 0.100 1 2 500 0.7932150 0.4275242
## 0.100 1 2 1000 0.7930707 0.4289401
## 0.100 1 2 2000 0.7917553 0.4284967
## 0.100 1 5 500 0.7950611 0.4331179
## 0.100 1 5 1000 0.7929287 0.4295684
## 0.100 1 5 2000 0.7925026 0.4304894
## 0.100 1 15 500 0.7942814 0.4303992
## 0.100 1 15 1000 0.7943871 0.4330570
## 0.100 1 15 2000 0.7931058 0.4316006
## 0.100 2 2 500 0.7921830 0.4355337
## 0.100 2 2 1000 0.7900478 0.4330814
## 0.100 2 2 2000 0.7866348 0.4247116
## 0.100 2 5 500 0.7917204 0.4355998
## 0.100 2 5 1000 0.7911854 0.4354312
## 0.100 2 5 2000 0.7870610 0.4261621
## 0.100 2 15 500 0.7922884 0.4360380
## 0.100 2 15 1000 0.7892317 0.4310267
## 0.100 2 15 2000 0.7874897 0.4263105
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 2000, interaction.depth =
## 2, shrinkage = 0.01 and n.minobsinnode = 5.
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_boost, highlight = TRUE) +
labs(title = "Evolución del accuracy del modelo Gradient Boosting") +
guides(color = guide_legend(title = "shrinkage"),
shape = guide_legend(title = "shrinkage")) +
theme_bw() +
theme(legend.position = "bottom")
Model: Support Vector Machine (radial kernel)
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- expand.grid(sigma = c(0.001, 0.01, 0.1, 0.5, 1),
C = c(1 , 20, 50, 100, 200, 500, 700))
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_svmrad <- train(Churn ~ ., data = datos_train_prep,
method = "svmRadial",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_svmrad
## Support Vector Machines with Radial Basis Function Kernel
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## sigma C Accuracy Kappa
## 0.001 1 0.7946376 0.4447396
## 0.001 20 0.7902657 0.4126974
## 0.001 50 0.7895897 0.3976770
## 0.001 100 0.7894830 0.3933122
## 0.001 200 0.7878133 0.3919951
## 0.001 500 0.7887724 0.3987108
## 0.001 700 0.7890920 0.4009527
## 0.010 1 0.7872785 0.3846160
## 0.010 20 0.7875637 0.3967441
## 0.010 50 0.7877770 0.3941641
## 0.010 100 0.7885587 0.4010363
## 0.010 200 0.7888432 0.4099250
## 0.010 500 0.7891272 0.4208492
## 0.010 700 0.7894121 0.4242435
## 0.100 1 0.7914398 0.4268918
## 0.100 20 0.7883799 0.4267513
## 0.100 50 0.7868873 0.4296732
## 0.100 100 0.7885222 0.4377427
## 0.100 200 0.7900494 0.4474032
## 0.100 500 0.7894821 0.4499308
## 0.100 700 0.7898729 0.4518908
## 0.500 1 0.7907981 0.4306511
## 0.500 20 0.7895893 0.4517241
## 0.500 50 0.7864257 0.4457784
## 0.500 100 0.7832271 0.4390163
## 0.500 200 0.7800997 0.4341689
## 0.500 500 0.7742723 0.4227378
## 0.500 700 0.7726727 0.4198318
## 1.000 1 0.7901578 0.4323429
## 1.000 20 0.7827299 0.4366312
## 1.000 50 0.7790345 0.4319732
## 1.000 100 0.7742013 0.4241068
## 1.000 200 0.7716069 0.4209782
## 1.000 500 0.7666675 0.4145062
## 1.000 700 0.7636809 0.4101711
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.001 and C = 1.
modelo_svmrad$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.001
##
## Number of Support Vectors : 2928
##
## Objective Function Value : -2792.835
## Training error : 0.204016
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_svmrad, highlight = TRUE) +
labs(title = "Evolución del accuracy del modelo SVM Radial") +
theme_bw()
Model comparison process. This step compares the results of the different
models in order to determine which of them meet the desired KPIs.
modelos <- list(KNN = modelo_knn, NB = modelo_nb, logistic = modelo_logistic,
LDA = modelo_lda, arbol = modelo_C50Tree, rf = modelo_rf,
boosting = modelo_boost, SVMradial = modelo_svmrad)
resultados_resamples <- resamples(modelos)
resultados_resamples$values %>% head(10)
## Resample KNN~Accuracy KNN~Kappa NB~Accuracy NB~Kappa logistic~Accuracy
## 1 Fold01.Rep1 0.7531083 0.2848997 0.7015986 0.3915863 0.7904085
## 2 Fold01.Rep2 0.7508897 0.2974122 0.7241993 0.4243401 0.7864769
## 3 Fold01.Rep3 0.7513321 0.2780200 0.7069272 0.3868924 0.7904085
## 4 Fold01.Rep4 0.7531083 0.2950645 0.7122558 0.4011582 0.7868561
## 5 Fold01.Rep5 0.7313167 0.2182508 0.6814947 0.3443565 0.7811388
## 6 Fold02.Rep1 0.7371226 0.2650908 0.7069272 0.3826057 0.7779751
## 7 Fold02.Rep2 0.7602131 0.3185792 0.7069272 0.4054682 0.7921847
## 8 Fold02.Rep3 0.7429078 0.2918745 0.6879433 0.3653522 0.7996454
## 9 Fold02.Rep4 0.7566607 0.3117298 0.6820604 0.3593366 0.7655417
## 10 Fold02.Rep5 0.7375887 0.2788721 0.6666667 0.3377305 0.7677305
## logistic~Kappa LDA~Accuracy LDA~Kappa arbol~Accuracy arbol~Kappa rf~Accuracy
## 1 0.4375434 0.7904085 0.4473872 0.8028419 0.4902068 0.7939609
## 2 0.4116518 0.7740214 0.3899762 0.7615658 0.3178747 0.7846975
## 3 0.4324888 0.7904085 0.4350274 0.7904085 0.4521811 0.7850799
## 4 0.4254516 0.7815275 0.4176009 0.7886323 0.4463378 0.7957371
## 5 0.4091896 0.7740214 0.4007925 0.7722420 0.3837825 0.7793594
## 6 0.3947434 0.7779751 0.4107378 0.7797513 0.4268120 0.7850799
## 7 0.4410654 0.7921847 0.4508691 0.7886323 0.4463378 0.7921847
## 8 0.4580612 0.7996454 0.4699601 0.7890071 0.4652964 0.7978723
## 9 0.3594001 0.7619893 0.3612781 0.7637655 0.3674531 0.7655417
## 10 0.3855289 0.7659574 0.3821948 0.7854610 0.4493610 0.7836879
## rf~Kappa boosting~Accuracy boosting~Kappa SVMradial~Accuracy
## 1 0.4519578 0.7957371 0.4456716 0.7921847
## 2 0.4081121 0.7740214 0.3671283 0.7793594
## 3 0.4320954 0.7992895 0.4577541 0.7904085
## 4 0.4380799 0.7939609 0.4370485 0.7850799
## 5 0.4083909 0.7740214 0.3844203 0.7864769
## 6 0.4270708 0.7761989 0.3912782 0.7779751
## 7 0.4460106 0.7939609 0.4446032 0.8010657
## 8 0.4617239 0.7960993 0.4434720 0.8049645
## 9 0.3594001 0.7708703 0.3667815 0.7548845
## 10 0.4339585 0.7836879 0.4188121 0.7624113
## SVMradial~Kappa
## 1 0.4484505
## 2 0.3948278
## 3 0.4324888
## 4 0.4219565
## 5 0.4222961
## 6 0.4028477
## 7 0.4637548
## 8 0.4783047
## 9 0.3363004
## 10 0.3616461
metricas_resamples <- resultados_resamples$values %>%
gather(key = "modelo", value = "valor", -Resample) %>%
separate(col = "modelo", into = c("modelo", "metrica"),
sep = "~", remove = TRUE)
metricas_resamples %>% head()
## Resample modelo metrica valor
## 1 Fold01.Rep1 KNN Accuracy 0.7531083
## 2 Fold01.Rep2 KNN Accuracy 0.7508897
## 3 Fold01.Rep3 KNN Accuracy 0.7513321
## 4 Fold01.Rep4 KNN Accuracy 0.7531083
## 5 Fold01.Rep5 KNN Accuracy 0.7313167
## 6 Fold02.Rep1 KNN Accuracy 0.7371226
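gather(), separate(), and spread() still work but are superseded in current tidyr; an equivalent sketch with pivot_longer (assumes tidyr >= 1.0):
metricas_resamples <- resultados_resamples$values %>%
  tidyr::pivot_longer(cols = -Resample,
                      names_to = c("modelo", "metrica"),
                      names_sep = "~",
                      values_to = "valor")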
Average Accuracy and Kappa of each model
metricas_resamples %>%
group_by(modelo, metrica) %>%
summarise(media = mean(valor)) %>%
spread(key = metrica, value = media) %>%
arrange(desc(Accuracy))
## `summarise()` has grouped output by 'modelo'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 3
## # Groups: modelo [8]
## modelo Accuracy Kappa
## <chr> <dbl> <dbl>
## 1 rf 0.798 0.454
## 2 boosting 0.796 0.442
## 3 SVMradial 0.795 0.445
## 4 logistic 0.794 0.441
## 5 LDA 0.794 0.448
## 6 arbol 0.792 0.449
## 7 KNN 0.755 0.297
## 8 NB 0.708 0.397
Graphical representation
metricas_resamples %>%
filter(metrica == "Accuracy") %>%
group_by(modelo) %>%
summarise(media = mean(valor)) %>%
ggplot(aes(x = reorder(modelo, media), y = media, label = round(media, 2))) +
geom_segment(aes(x = reorder(modelo, media), y = 0,
xend = modelo, yend = media),
color = "grey50") +
geom_point(size = 7, color = "firebrick") +
geom_text(color = "white", size = 2.5) +
scale_y_continuous(limits = c(0, 1)) +
# Baseline accuracy: proportion of the majority class ("No"), about 0.734
geom_hline(yintercept = 0.734, linetype = "dashed") +
annotate(geom = "text", y = 0.80, x = 8.5, label = "Baseline accuracy") +
labs(title = "Validation: mean accuracy over repeated CV",
subtitle = "Models ordered by mean accuracy",
x = "model") +
coord_flip() +
theme_bw()
metricas_resamples %>% filter(metrica == "Accuracy") %>%
group_by(modelo) %>%
mutate(media = mean(valor)) %>%
ungroup() %>%
ggplot(aes(x = reorder(modelo, media), y = valor, color = modelo)) +
geom_boxplot(alpha = 0.6, outlier.shape = NA) +
geom_jitter(width = 0.1, alpha = 0.6) +
scale_y_continuous(limits = c(0, 1)) +
# Baseline accuracy: proportion of the majority class ("No"), about 0.734
geom_hline(yintercept = 0.734, linetype = "dashed") +
annotate(geom = "text", y = 0.65, x = 8.5, label = "Baseline accuracy") +
theme_bw() +
labs(title = "Validation: mean accuracy over repeated CV",
subtitle = "Models ordered by mean accuracy") +
coord_flip() +
theme(legend.position = "none")
# Multiple comparisons with paired Wilcoxon (signed-rank) tests
# ==============================================================================
metricas_accuracy <- metricas_resamples %>% filter(metrica == "Accuracy")
comparaciones <- pairwise.wilcox.test(x = metricas_accuracy$valor,
g = metricas_accuracy$modelo,
paired = TRUE,
p.adjust.method = "holm")
# The p-values are stored as a data frame
comparaciones <- comparaciones$p.value %>%
as.data.frame() %>%
rownames_to_column(var = "modeloA") %>%
gather(key = "modeloB", value = "p_value", -modeloA) %>%
na.omit() %>%
arrange(modeloA)
comparaciones
## modeloA modeloB p_value
## 1 boosting arbol 0.02764486331395
## 2 KNN arbol 0.00000002169547
## 3 KNN boosting 0.00000002169547
## 4 LDA arbol 1.00000000000000
## 5 LDA boosting 0.86201775379628
## 6 LDA KNN 0.00000002169547
## 7 logistic arbol 0.86201775379628
## 8 logistic boosting 1.00000000000000
## 9 logistic KNN 0.00000002169547
## 10 logistic LDA 1.00000000000000
## 11 NB arbol 0.00000002169547
## 12 NB boosting 0.00000002169547
## 13 NB KNN 0.00000002169547
## 14 NB LDA 0.00000002169547
## 15 NB logistic 0.00000002169547
## 16 rf arbol 0.00053500316876
## 17 rf boosting 0.86201775379628
## 18 rf KNN 0.00000002169547
## 19 rf LDA 0.04574417935984
## 20 rf logistic 0.30426220952912
## 21 rf NB 0.00000002169547
## 22 SVMradial arbol 0.88300816132906
## 23 SVMradial boosting 1.00000000000000
## 24 SVMradial KNN 0.00000002169547
## 25 SVMradial LDA 0.88300816132906
## 26 SVMradial logistic 1.00000000000000
## 27 SVMradial NB 0.00000002169547
## 28 SVMradial rf 0.28990173776701
Test error
predicciones <- extractPrediction(
models = modelos,
testX = datos_test_prep %>% select(-Churn),
testY = datos_test_prep$Churn
)
predicciones %>% head()
## obs pred model dataType object
## 1 No Yes knn Training KNN
## 2 No No knn Training KNN
## 3 Yes No knn Training KNN
## 4 No No knn Training KNN
## 5 Yes Yes knn Training KNN
## 6 Yes No knn Training KNN
metricas_predicciones <- predicciones %>%
mutate(acierto = ifelse(obs == pred, TRUE, FALSE)) %>%
group_by(object, dataType) %>%
summarise(accuracy = mean(acierto))
## `summarise()` has grouped output by 'object'. You can override using the
## `.groups` argument.
metricas_predicciones %>%
spread(key = dataType, value = accuracy) %>%
arrange(desc(Test))
## # A tibble: 8 × 3
## # Groups: object [8]
## object Test Training
## <chr> <dbl> <dbl>
## 1 logistic 0.806 0.795
## 2 boosting 0.804 0.804
## 3 arbol 0.801 0.800
## 4 rf 0.8 0.817
## 5 SVMradial 0.799 0.796
## 6 LDA 0.797 0.794
## 7 KNN 0.755 0.796
## 8 NB 0.705 0.709
Graphical comparison of test and training accuracy
ggplot(data = metricas_predicciones,
aes(x = reorder(object, accuracy), y = accuracy,
color = dataType, label = round(accuracy, 2))) +
geom_point(size = 8) +
scale_color_manual(values = c("orangered2", "gray50")) +
geom_text(color = "white", size = 3) +
scale_y_continuous(limits = c(0, 1)) +
# Baseline accuracy: proportion of the majority class ("No"), about 0.734
geom_hline(yintercept = 0.734, linetype = "dashed") +
annotate(geom = "text", y = 0.80, x = 8.5, label = "Baseline accuracy") +
coord_flip() +
labs(title = "Training and test accuracy",
x = "model") +
theme_bw() +
theme(legend.position = "bottom")
Based on the analysis above, the random forest (rf) model is chosen over the single C5.0 tree, since it attains the highest cross-validated accuracy; note, however, that the logistic regression achieves the best accuracy on the test set.
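As a closing check, a minimal sketch of how the chosen model could be evaluated on the held-out test set with a confusion matrix (caret::confusionMatrix; treating "Yes" as the positive class is an assumption):
pred_test <- predict(modelo_rf, newdata = datos_test_prep)
confusionMatrix(data = pred_test,
                reference = datos_test_prep$Churn,
                positive = "Yes")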