The libraries are loaded.
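The original does not list the library() calls. A plausible set, given the functions used throughout (glimpse, map_dbl, ggarrange, createDataPartition, gather, ...), would be:
library(dplyr)   # glimpse(), select(), mutate()
library(purrr)   # map_dbl(), map_if()
library(ggplot2) # plots
library(ggpubr)  # ggarrange(), annotate_figure(), text_grob()
library(tibble)  # rownames_to_column()
library(tidyr)   # gather(), spread(), separate()
library(stringr) # string helpers (partly masked later when recipes is attached)
library(caret)   # createDataPartition(), train(), resamples()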
Step 0: Data loading
datos_modelo <- read.csv("C:/Users/Osvaldo Mora Mora/Documents/Modelo Churn/Modelo Churn publicación Rpubs/Telco-Customer-Churn.csv", header = T, sep = ";")
Step 1: Exploratory analysis of the variables
as.data.frame(sort(names(datos_modelo)))
## sort(names(datos_modelo))
## 1 Churn
## 2 Contract
## 3 customerID
## 4 Dependents
## 5 DeviceProtection
## 6 gender
## 7 InternetService
## 8 MonthlyCharges
## 9 MultipleLines
## 10 OnlineBackup
## 11 OnlineSecurity
## 12 PaperlessBilling
## 13 Partner
## 14 PaymentMethod
## 15 PhoneService
## 16 SeniorCitizen
## 17 StreamingMovies
## 18 StreamingTV
## 19 TechSupport
## 20 tenure
## 21 TotalCharges
str(datos_modelo)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
glimpse(datos_modelo)
## Rows: 7,043
## Columns: 21
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW…
## $ gender <chr> "Female", "Male", "Male", "Male", "Female", "Female",…
## $ SeniorCitizen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes…
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"…
## $ tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2…
## $ PhoneService <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ MultipleLines <chr> "No phone service", "No", "No", "No phone service", "…
## $ InternetService <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber opt…
## $ OnlineSecurity <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "…
## $ OnlineBackup <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "N…
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Y…
## $ TechSupport <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes…
## $ StreamingTV <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Ye…
## $ StreamingMovies <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes…
## $ Contract <chr> "Month-to-month", "One year", "Month-to-month", "One …
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", …
## $ PaymentMethod <chr> "Electronic check", "Mailed check", "Mailed check", "…
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7…
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949…
## $ Churn <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "Y…
# Number of observations in the dataset
nrow(datos_modelo)
## [1] 7043
# Check whether any rows are incomplete
any(!complete.cases(datos_modelo))
## [1] TRUE
# Number of missing values per variable
map_dbl(datos_modelo, .f = function(x){sum(is.na(x))})
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
## Since the objective is to build a model, the rows with NAs are simply dropped; note that there are dedicated methods (e.g., imputation) for treating NAs.
The format of some variables is now adjusted to make them easier to handle and manipulate when the model is developed.
datos_modelo <- na.omit(datos_modelo)
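# After dropping the 11 incomplete rows, 7,032 observations (7,043 - 11) should remain:
nrow(datos_modelo)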
datos_modelo$MultipleLines <- gsub(" ", "_", x= datos_modelo$MultipleLines)
datos_modelo$PaymentMethod <- gsub(" ", "_", x= datos_modelo$PaymentMethod)
datos_modelo$OnlineSecurity <- gsub(" ", "_", x= datos_modelo$OnlineSecurity)
datos_modelo$OnlineBackup <- gsub(" ", "_", x= datos_modelo$OnlineBackup)
datos_modelo$DeviceProtection <- gsub(" ", "_", x= datos_modelo$DeviceProtection)
datos_modelo$TechSupport <- gsub(" ", "_", x= datos_modelo$TechSupport)
datos_modelo$StreamingTV <- gsub(" ", "_", x= datos_modelo$StreamingTV)
datos_modelo$StreamingMovies <- gsub(" ", "_", x= datos_modelo$StreamingMovies)
datos_modelo$Contract <- gsub(" ", "_", x= datos_modelo$Contract)
datos_modelo$Contract <- gsub("-", "_", x= datos_modelo$Contract)
datos_modelo$PaperlessBilling <- gsub(" ", "_", x= datos_modelo$PaperlessBilling)
## The character variables are converted to factors
datos_modelo$gender <- as.factor(datos_modelo$gender)
datos_modelo$Partner <- as.factor(datos_modelo$Partner)
datos_modelo$PhoneService <- as.factor(datos_modelo$PhoneService)
datos_modelo$MultipleLines <- as.factor(datos_modelo$MultipleLines)
datos_modelo$InternetService <- as.factor(datos_modelo$InternetService)
datos_modelo$OnlineSecurity <- as.factor(datos_modelo$OnlineSecurity)
datos_modelo$OnlineBackup <- as.factor(datos_modelo$OnlineBackup)
datos_modelo$DeviceProtection <- as.factor(datos_modelo$DeviceProtection)
datos_modelo$TechSupport <- as.factor(datos_modelo$TechSupport)
datos_modelo$StreamingTV <- as.factor(datos_modelo$StreamingTV)
datos_modelo$StreamingMovies <- as.factor(datos_modelo$StreamingMovies)
datos_modelo$Contract <- as.factor(datos_modelo$Contract)
datos_modelo$PaperlessBilling <- as.factor(datos_modelo$PaperlessBilling)
datos_modelo$PaymentMethod <- as.factor(datos_modelo$PaymentMethod)
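For reference, the column-by-column cleanup above can be written more compactly with dplyr's across(). This is an equivalent sketch (it assumes dplyr >= 1.0 and, unlike the step-by-step version, also converts the remaining character columns such as Dependents and Churn to factors):
cols_a_limpiar <- c("MultipleLines", "PaymentMethod", "OnlineSecurity",
                    "OnlineBackup", "DeviceProtection", "TechSupport",
                    "StreamingTV", "StreamingMovies", "Contract",
                    "PaperlessBilling")
datos_modelo <- datos_modelo %>%
  # replace spaces and hyphens with underscores in the selected columns
  mutate(across(all_of(cols_a_limpiar), ~ gsub("[ -]", "_", .x))) %>%
  # convert every remaining character column except customerID to factor
  mutate(across(where(is.character) & !any_of("customerID"), as.factor))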
# Descriptive analysis of the variables: distribution of the response (target) variable
## Distribution of the variable to be predicted ##
## Assessment of the class-imbalance problem ##
ggplot(data = datos_modelo,
mapping = aes(x =Churn, fill = Churn)) +
geom_bar() + scale_fill_manual(values = c("cadetblue2","brown1"),
name = "Estado Actual") +
labs(x = "", y = "Cantidad de individuos",
title = "Distribución de la variable Objetivo") +
theme_minimal() +
theme(legend.position = "bottom")
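The imbalance can also be quantified directly; the class proportions (roughly 73% "No" vs. 27% "Yes", consistent with the split proportions shown later) are obtained with:
prop.table(table(datos_modelo$Churn))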
## Analysis of numeric variables
datoscont <- datos_modelo %>% select(Churn, SeniorCitizen, tenure, MonthlyCharges, TotalCharges)
# Function that, for column i of datoscont, draws a density plot and a boxplot
# of the variable split by Churn, and prints summary statistics for each class.
Analisiscont <- function(i) {
  nombres <- colnames(datoscont)
  p1 <- ggplot(data = datos_modelo, aes(x = datoscont[, i], fill = Churn)) +
    geom_density(alpha = 0.5) +
    labs(title = paste("Density of variable", nombres[i]), x = nombres[i], y = "Density") +
    scale_fill_manual(values = c("gray50", "#1874CD")) +
    geom_rug(aes(color = Churn), alpha = 0.5) +
    scale_color_manual(values = c("gray50", "#1874CD")) +
    theme_bw()
  p2 <- ggplot(data = datos_modelo, aes(x = Churn, y = datoscont[, i], color = Churn)) +
    geom_boxplot(outlier.shape = NA) +
    labs(title = paste("Boxplot of variable", nombres[i]), x = nombres[i], y = "Boxplot") +
    geom_jitter(alpha = 0.3, width = 0.15) +
    scale_color_manual(values = c("gray50", "#1874CD")) +
    theme_bw()
  final_plot <- ggarrange(p1, p2, legend = "top")
  final_plot <- annotate_figure(final_plot, top = text_grob("Density and boxplot charts", size = 15))
  # Summary statistics (mean, median, min, max) for each class of Churn
  x <- na.omit(subset(datoscont, Churn == "Yes"))
  SI <- x %>% summarise(media = mean(x[, i]),
                        mediana = median(x[, i]),
                        min = min(x[, i]),
                        max = max(x[, i]))
  y <- na.omit(subset(datoscont, Churn == "No"))
  NO <- y %>% summarise(media = mean(y[, i]),
                        mediana = median(y[, i]),
                        min = min(y[, i]),
                        max = max(y[, i]))
  final <- as.data.frame(rbind(SI, NO))
  rownames(final) <- c("Yes", "No")
  print(final)
  boxplot(datoscont[, i], main = paste("Boxplot of variable", nombres[i]))
  plot(final_plot)
}
# Loop over the numeric columns (column 1 is Churn)
for (i in 2:ncol(datoscont)) {
  Analisiscont(i)
}
The four summary tables below correspond, in order, to SeniorCitizen, tenure, MonthlyCharges, and TotalCharges.
## media mediana min max
## Yes 0.2546816 0 0 1
## No 0.1289948 0 0 1
## media mediana min max
## Yes 17.97913 10 1 72
## No 37.65001 38 1 72
## media mediana min max
## Yes 74.44133 79.65 18.85 118.35
## No 61.30741 64.45 18.25 118.75
## media mediana min max
## Yes 1531.796 703.55 18.85 8684.80
## No 2555.344 1683.60 18.80 8672.45
# Analysis of categorical (qualitative) variables
datoscat <- select(datos_modelo, -customerID, -SeniorCitizen, -tenure, -MonthlyCharges, -TotalCharges)
predictora <- select(datos_modelo, Churn)
datoscat <- select(datoscat, -Churn)
datoscat <- cbind(predictora, datoscat)
# Function that, for column i of datoscat, draws a bar chart split by Churn and
# prints the absolute counts and row-wise relative frequencies against Churn.
Analisiscat <- function(i) {
  nombres <- colnames(datoscat)
  g <- ggplot(data = datoscat, aes(x = datoscat[, i], fill = Churn)) +
    geom_bar() +
    labs(title = paste("Bar chart of variable", nombres[i]), x = nombres[i], y = "Count") +
    scale_fill_manual(values = c("gray50", "#1874CD")) +
    theme_bw() +
    theme(legend.position = "bottom")
  plot(g)
  tab_abs <- table(datoscat[, i], datos_modelo$Churn)  # absolute counts
  tab_rel <- prop.table(tab_abs, margin = 1) %>% round(digits = 2)
  print(tab_abs)
  print(tab_rel)
}
# Loop over the categorical variables (column 1 is Churn)
for (i in 2:ncol(datoscat)) {
  Analisiscat(i)
}
The tables below appear in column order: gender, Partner, Dependents, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, and PaymentMethod.
##
## No Yes
## Female 2544 939
## Male 2619 930
##
## No Yes
## Female 0.73 0.27
## Male 0.74 0.26
##
## No Yes
## No 2439 1200
## Yes 2724 669
##
## No Yes
## No 0.67 0.33
## Yes 0.80 0.20
##
## No Yes
## No 3390 1543
## Yes 1773 326
##
## No Yes
## No 0.69 0.31
## Yes 0.84 0.16
##
## No Yes
## No 510 170
## Yes 4653 1699
##
## No Yes
## No 0.75 0.25
## Yes 0.73 0.27
##
## No Yes
## No 2536 849
## No_phone_service 510 170
## Yes 2117 850
##
## No Yes
## No 0.75 0.25
## No_phone_service 0.75 0.25
## Yes 0.71 0.29
##
## No Yes
## DSL 1957 459
## Fiber optic 1799 1297
## No 1407 113
##
## No Yes
## DSL 0.81 0.19
## Fiber optic 0.58 0.42
## No 0.93 0.07
##
## No Yes
## No 2036 1461
## No_internet_service 1407 113
## Yes 1720 295
##
## No Yes
## No 0.58 0.42
## No_internet_service 0.93 0.07
## Yes 0.85 0.15
##
## No Yes
## No 1854 1233
## No_internet_service 1407 113
## Yes 1902 523
##
## No Yes
## No 0.60 0.40
## No_internet_service 0.93 0.07
## Yes 0.78 0.22
##
## No Yes
## No 1883 1211
## No_internet_service 1407 113
## Yes 1873 545
##
## No Yes
## No 0.61 0.39
## No_internet_service 0.93 0.07
## Yes 0.77 0.23
##
## No Yes
## No 2026 1446
## No_internet_service 1407 113
## Yes 1730 310
##
## No Yes
## No 0.58 0.42
## No_internet_service 0.93 0.07
## Yes 0.85 0.15
##
## No Yes
## No 1867 942
## No_internet_service 1407 113
## Yes 1889 814
##
## No Yes
## No 0.66 0.34
## No_internet_service 0.93 0.07
## Yes 0.70 0.30
##
## No Yes
## No 1843 938
## No_internet_service 1407 113
## Yes 1913 818
##
## No Yes
## No 0.66 0.34
## No_internet_service 0.93 0.07
## Yes 0.70 0.30
##
## No Yes
## Month_to_month 2220 1655
## One_year 1306 166
## Two_year 1637 48
##
## No Yes
## Month_to_month 0.57 0.43
## One_year 0.89 0.11
## Two_year 0.97 0.03
##
## No Yes
## No 2395 469
## Yes 2768 1400
##
## No Yes
## No 0.84 0.16
## Yes 0.66 0.34
##
## No Yes
## Bank_transfer_(automatic) 1284 258
## Credit_card_(automatic) 1289 232
## Electronic_check 1294 1071
## Mailed_check 1296 308
##
## No Yes
## Bank_transfer_(automatic) 0.83 0.17
## Credit_card_(automatic) 0.85 0.15
## Electronic_check 0.55 0.45
## Mailed_check 0.81 0.19
## Variable importance analysis
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
datos_rf <- datos_modelo %>%
select(-customerID) %>%
na.omit()
datos_rf <- map_if(.x = datos_rf, .p = is.character, .f = as.factor) %>%
as.data.frame()
modelo_randforest <- randomForest(formula = Churn ~ . ,
data = na.omit(datos_rf),
mtry = 5,
importance = TRUE,
ntree = 1000)
importancia <- as.data.frame(modelo_randforest$importance)
importancia <- rownames_to_column(importancia,var = "variable")
p1 <- ggplot(data = importancia, aes(x = reorder(variable, MeanDecreaseAccuracy),
y = MeanDecreaseAccuracy,
fill = MeanDecreaseAccuracy)) +
labs(x = "variable", title = "Reducción de Accuracy") +
geom_col() +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom")
p2 <- ggplot(data = importancia, aes(x = reorder(variable, MeanDecreaseGini),
y = MeanDecreaseGini,
fill = MeanDecreaseGini)) +
labs(x = "variable", title = "Reducción de pureza (Gini)") +
geom_col() +
coord_flip() +
theme_bw() +
theme(legend.position = "bottom")
ggarrange(p1, p2)
Conclusion: from the analysis above, the following variables are selected for modeling:
1. TotalCharges
2. MonthlyCharges
3. tenure
4. Contract
5. PaymentMethod
6. OnlineSecurity
7. TechSupport
8. InternetService
(Note that MonthlyCharges is not carried into the modeling dataset built below, so the final models use seven predictors, as the recipe output confirms.)
# Modeling
base_final_modelo <- select(datos_modelo,
Churn,
TotalCharges,
tenure,
Contract,
PaymentMethod,
OnlineSecurity,
TechSupport,
InternetService
)
set.seed(123)
# Create the indices of the training observations
train <- createDataPartition(y = base_final_modelo$Churn, p = 0.8, list = FALSE, times = 1)
datos_train <- base_final_modelo[train, ]
datos_test <- base_final_modelo[-train, ]
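A quick sanity check on the split sizes (the 5,627 training rows are confirmed later by the recipe output; the remaining 1,405 rows form the test set):
nrow(datos_train) # 5627
nrow(datos_test)  # 1405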
prop.table(table(datos_train$Churn))
##
## No Yes
## 0.734139 0.265861
prop.table(table(datos_test$Churn))
##
## No Yes
## 0.7345196 0.2654804
## Dummy-variable transformation
## The recipes package is used
library(recipes)
## Warning: package 'recipes' was built under R version 4.2.1
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
objeto_recipe <- recipe(formula = Churn ~.,
data = datos_train)
objeto_recipe
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 7
objeto_recipe <- objeto_recipe %>% step_dummy(all_nominal(), -all_outcomes())
trained_recipe <- prep(objeto_recipe, training = datos_train)
trained_recipe
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 7
##
## Training data contained 5627 data points and no missing data.
##
## Operations:
##
## Dummy variables from Contract, PaymentMethod, OnlineSecurity, TechSupport, Int... [trained]
# The transformations are applied to the training and test sets
datos_train_prep <- bake(trained_recipe, new_data = datos_train)
datos_test_prep <- bake(trained_recipe, new_data = datos_test)
glimpse(datos_train_prep)
## Rows: 5,627
## Columns: 14
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75,…
## $ tenure <int> 1, 34, 2, 45, 2, 8, 10, 13, 58, …
## $ Churn <fct> No, No, Yes, No, Yes, Yes, No, N…
## $ Contract_One_year <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ Contract_Two_year <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ PaymentMethod_Credit_card_.automatic. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,…
## $ PaymentMethod_Electronic_check <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,…
## $ PaymentMethod_Mailed_check <dbl> 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,…
## $ OnlineSecurity_No_internet_service <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ OnlineSecurity_Yes <dbl> 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,…
## $ TechSupport_No_internet_service <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ TechSupport_Yes <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,…
## $ InternetService_Fiber.optic <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,…
## $ InternetService_No <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
The modeling process now begins.
Model: KNN
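Several of the trainControl() calls below set allowParallel = TRUE, which only takes effect if a parallel backend has been registered beforehand. A minimal sketch, assuming the doParallel package is installed:
library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores() - 1)  # leave one core free
registerDoParallel(cl)
# ... fit the models ...
# stopCluster(cl)  # release the workers when done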
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(k = c(1, 2, 5, 10, 15, 20, 30, 50))
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
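# Note: caret expects one seed vector per resampling iteration, each with at
# least as many seeds as hyperparameter candidates, plus a single seed in the
# last list element for the final model fit; the same pattern is reused below.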
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_knn <- train(Churn ~ ., data = datos_train_prep,
method = "knn",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_knn
## k-Nearest Neighbors
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7094338 0.2627058
## 2 0.7091177 0.2605544
## 5 0.7520541 0.3116591
## 10 0.7547546 0.2973680
## 15 0.7541859 0.2728730
## 20 0.7534402 0.2586785
## 30 0.7516267 0.2153460
## 50 0.7487468 0.1632502
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_knn, highlight = TRUE) +
scale_x_continuous(breaks = hiperparametros$k) +
labs(title = "Evolución del accuracy del modelo KNN", x = "K") +
theme_bw()
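KNN is distance-based, so unscaled predictors such as TotalCharges can dominate the distance computation. A hedged variant of the earlier recipe that also centers and scales the numeric predictors (step_normalize is part of recipes; objeto_recipe_norm is a hypothetical name):
objeto_recipe_norm <- recipe(Churn ~ ., data = datos_train) %>%
  step_normalize(all_numeric_predictors()) %>%  # scale tenure and TotalCharges
  step_dummy(all_nominal(), -all_outcomes())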
Model: Naive Bayes
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(usekernel = FALSE, fL = 0 , adjust = 0)
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = F)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_nb <- train(Churn ~ ., data = datos_train_prep,
method = "nb",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_nb
## Naive Bayes
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7083022 0.3974846
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'usekernel' was held constant at a value of FALSE
## Tuning
## parameter 'adjust' was held constant at a value of 0
Model: Logistic Regression
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(parameter = "none")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_logistic <- train(Churn ~ ., data = datos_train_prep,
method = "glm",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train,
family = "binomial")
modelo_logistic
## Generalized Linear Model
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7944944 0.441462
summary(modelo_logistic$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6287 -0.6832 -0.2974 0.7882 3.3504
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) 0.00687108 0.12639968 0.054
## TotalCharges 0.00040599 0.00007141 5.685
## tenure -0.06360217 0.00672134 -9.463
## Contract_One_year -0.71615138 0.11729818 -6.105
## Contract_Two_year -1.51183448 0.19640933 -7.697
## PaymentMethod_Credit_card_.automatic. 0.01754077 0.12501386 0.140
## PaymentMethod_Electronic_check 0.36586878 0.10339514 3.539
## PaymentMethod_Mailed_check -0.19157921 0.12571149 -1.524
## OnlineSecurity_No_internet_service -1.09871254 0.14234678 -7.719
## OnlineSecurity_Yes -0.48222793 0.09307387 -5.181
## TechSupport_No_internet_service NA NA NA
## TechSupport_Yes -0.35997033 0.09407138 -3.827
## InternetService_Fiber.optic 0.67212528 0.09967481 6.743
## InternetService_No NA NA NA
## Pr(>|z|)
## (Intercept) 0.956648
## TotalCharges 0.0000000130869939 ***
## tenure < 0.0000000000000002 ***
## Contract_One_year 0.0000000010254845 ***
## Contract_Two_year 0.0000000000000139 ***
## PaymentMethod_Credit_card_.automatic. 0.888415
## PaymentMethod_Electronic_check 0.000402 ***
## PaymentMethod_Mailed_check 0.127519
## OnlineSecurity_No_internet_service 0.0000000000000118 ***
## OnlineSecurity_Yes 0.0000002205442449 ***
## TechSupport_No_internet_service NA
## TechSupport_Yes 0.000130 ***
## InternetService_Fiber.optic 0.0000000000154956 ***
## InternetService_No NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6517.2 on 5626 degrees of freedom
## Residual deviance: 4748.3 on 5615 degrees of freedom
## AIC: 4772.3
##
## Number of Fisher Scoring iterations: 6
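The two coefficients reported as NA arise because TechSupport_No_internet_service and InternetService_No carry exactly the same information as OnlineSecurity_No_internet_service (all three encode "no internet service"), so glm drops them as perfectly collinear. A hedged way to confirm this with caret's findLinearCombos:
# detect exact linear dependencies among the dummy predictors
caret::findLinearCombos(as.matrix(datos_train_prep %>% select(-Churn)))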
Model: LDA
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(parameter = "none")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_lda <- train(Churn ~ ., data = datos_train_prep,
method = "lda",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_lda
## Linear Discriminant Analysis
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.793572 0.4482773
modelo_lda$finalModel
## Call:
## lda(x, grouping = y)
##
## Prior probabilities of groups:
## No Yes
## 0.734139 0.265861
##
## Group means:
## TotalCharges tenure Contract_One_year Contract_Two_year
## No 2562.370 37.67901 0.25006052 0.31905108
## Yes 1536.372 17.97460 0.08957219 0.02540107
## PaymentMethod_Credit_card_.automatic. PaymentMethod_Electronic_check
## No 0.2447349 0.2500605
## Yes 0.1303476 0.5661765
## PaymentMethod_Mailed_check OnlineSecurity_No_internet_service
## No 0.2532075 0.27184701
## Yes 0.1624332 0.05949198
## OnlineSecurity_Yes TechSupport_No_internet_service TechSupport_Yes
## No 0.3326071 0.27184701 0.3292181
## Yes 0.1590909 0.05949198 0.1684492
## InternetService_Fiber.optic InternetService_No
## No 0.3524570 0.27184701
## Yes 0.6938503 0.05949198
##
## Coefficients of linear discriminants:
## LD1
## TotalCharges -0.0001259866
## tenure -0.0135844702
## Contract_One_year -0.5702482927
## Contract_Two_year -0.3970764027
## PaymentMethod_Credit_card_.automatic. 0.0186664183
## PaymentMethod_Electronic_check 0.4103846868
## PaymentMethod_Mailed_check -0.1556114816
## OnlineSecurity_No_internet_service -0.3123079295
## OnlineSecurity_Yes -0.3542029605
## TechSupport_No_internet_service -0.3123079295
## TechSupport_Yes -0.2445416912
## InternetService_Fiber.optic 0.9512969841
## InternetService_No -0.3123079295
Model: Simple classification tree (C5.0)
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- data.frame(parameter = "none")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_C50Tree <- train(Churn~ ., data = datos_train_prep,
method = "C5.0Tree",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_C50Tree
## Single C5.0 Tree
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7918968 0.4493307
summary(modelo_C50Tree$finalModel)
##
## Call:
## C50:::C5.0.default(x = x, y = y, weights = wts)
##
##
## C5.0 [Release 2.07 GPL Edition] Thu Oct 27 14:18:33 2022
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 5627 cases (14 attributes) from undefined.data
##
## Decision tree:
##
## Contract_Two_year > 0: No (1356/38)
## Contract_Two_year <= 0:
## :...Contract_One_year > 0: No (1167/134)
## Contract_One_year <= 0:
## :...InternetService_Fiber.optic <= 0:
## :...tenure > 5: No (800/143)
## : tenure <= 5:
## : :...OnlineSecurity_No_internet_service > 0: No (222/59)
## : OnlineSecurity_No_internet_service <= 0:
## : :...TechSupport_Yes <= 0: Yes (302/136)
## : TechSupport_Yes > 0: No (71/27)
## InternetService_Fiber.optic > 0:
## :...tenure <= 15: Yes (833/260)
## tenure > 15:
## :...tenure > 54: No (143/32)
## tenure <= 54:
## :...PaymentMethod_Electronic_check <= 0: No (331/116)
## PaymentMethod_Electronic_check > 0:
## :...TechSupport_Yes <= 0: Yes (324/148)
## TechSupport_Yes > 0: No (78/32)
##
##
## Evaluation on training data (5627 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 11 1125(20.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3587 544 (a): class No
## 581 915 (b): class Yes
##
##
## Attribute usage:
##
## 100.00% Contract_Two_year
## 75.90% Contract_One_year
## 55.16% tenure
## 55.16% InternetService_Fiber.optic
## 13.77% TechSupport_Yes
## 13.03% PaymentMethod_Electronic_check
## 10.57% OnlineSecurity_No_internet_service
##
##
## Time: 0.0 secs
Model: Random Forest
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- expand.grid(mtry = c(3, 4, 5, 7),
min.node.size = c(2, 3, 4, 5, 10, 15, 20, 30),
splitrule = "gini")
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_rf <- train(Churn ~ ., data = datos_train_prep,
method = "ranger",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train,
# Number of trees to fit
num.trees = 500)
modelo_rf
## Random Forest
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## mtry min.node.size Accuracy Kappa
## 3 2 0.7958807 0.4450180
## 3 3 0.7954187 0.4430749
## 3 4 0.7961637 0.4453089
## 3 5 0.7962350 0.4455455
## 3 10 0.7957022 0.4440247
## 3 15 0.7953476 0.4429570
## 3 20 0.7954888 0.4429158
## 3 30 0.7965906 0.4464387
## 4 2 0.7957727 0.4501614
## 4 3 0.7958792 0.4500688
## 4 4 0.7965551 0.4518373
## 4 5 0.7965901 0.4517354
## 4 10 0.7967332 0.4520130
## 4 15 0.7971947 0.4530559
## 4 20 0.7971590 0.4522873
## 4 30 0.7977997 0.4543360
## 5 2 0.7932485 0.4475451
## 5 3 0.7929997 0.4461445
## 5 4 0.7935686 0.4475773
## 5 5 0.7937462 0.4474147
## 5 10 0.7937817 0.4466109
## 5 15 0.7946703 0.4477403
## 5 20 0.7953818 0.4493334
## 5 30 0.7972309 0.4529959
## 7 2 0.7824795 0.4224128
## 7 3 0.7837946 0.4239638
## 7 4 0.7848242 0.4267782
## 7 5 0.7859975 0.4289385
## 7 10 0.7879509 0.4315076
## 7 15 0.7891610 0.4321659
## 7 20 0.7891954 0.4298566
## 7 30 0.7913283 0.4328507
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 4, splitrule = gini
## and min.node.size = 30.
modelo_rf$finalModel
## Ranger result
##
## Call:
## ranger::ranger(dependent.variable.name = ".outcome", data = x, mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size, splitrule = as.character(param$splitrule), write.forest = TRUE, probability = classProbs, ...)
##
## Type: Classification
## Number of trees: 500
## Sample size: 5627
## Number of independent variables: 13
## Mtry: 4
## Target node size: 30
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error: 20.12 %
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_rf, highlight = TRUE) +
scale_x_continuous(breaks = 1:30) +
labs(title = "Evolución del accuracy del modelo Random Forest") +
guides(color = guide_legend(title = "mtry"),
shape = guide_legend(title = "mtry")) +
theme_bw()
Model: Gradient Boosting
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- expand.grid(interaction.depth = c(1, 2),
n.trees = c(500, 1000, 2000),
shrinkage = c(0.001, 0.01, 0.1),
n.minobsinnode = c(2, 5, 15))
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_boost <- train(Churn ~ ., data = datos_train_prep,
method = "gbm",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train,
# Exponential (AdaBoost) loss for the binary outcome
distribution = "adaboost",
verbose = FALSE)
modelo_boost
## Stochastic Gradient Boosting
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees Accuracy Kappa
## 0.001 1 2 500 0.7341395 0.0000000
## 0.001 1 2 1000 0.7341395 0.0000000
## 0.001 1 2 2000 0.7623250 0.1871962
## 0.001 1 5 500 0.7341395 0.0000000
## 0.001 1 5 1000 0.7341395 0.0000000
## 0.001 1 5 2000 0.7621829 0.1864404
## 0.001 1 15 500 0.7341395 0.0000000
## 0.001 1 15 1000 0.7341395 0.0000000
## 0.001 1 15 2000 0.7622184 0.1867043
## 0.001 2 2 500 0.7341395 0.0000000
## 0.001 2 2 1000 0.7341395 0.0000000
## 0.001 2 2 2000 0.7847554 0.3378705
## 0.001 2 5 500 0.7341395 0.0000000
## 0.001 2 5 1000 0.7341395 0.0000000
## 0.001 2 5 2000 0.7846841 0.3375264
## 0.001 2 15 500 0.7341395 0.0000000
## 0.001 2 15 1000 0.7341395 0.0000000
## 0.001 2 15 2000 0.7848975 0.3384592
## 0.010 1 2 500 0.7888076 0.3656672
## 0.010 1 2 1000 0.7928607 0.4064255
## 0.010 1 2 2000 0.7946712 0.4259312
## 0.010 1 5 500 0.7887366 0.3653983
## 0.010 1 5 1000 0.7926116 0.4057564
## 0.010 1 5 2000 0.7945291 0.4255129
## 0.010 1 15 500 0.7887364 0.3655223
## 0.010 1 15 1000 0.7926822 0.4057301
## 0.010 1 15 2000 0.7945295 0.4254822
## 0.010 2 2 500 0.7949201 0.4228465
## 0.010 2 2 1000 0.7952762 0.4348609
## 0.010 2 2 2000 0.7955957 0.4409645
## 0.010 2 5 500 0.7947064 0.4219485
## 0.010 2 5 1000 0.7958452 0.4375691
## 0.010 2 5 2000 0.7958797 0.4423048
## 0.010 2 15 500 0.7950270 0.4224679
## 0.010 2 15 1000 0.7957384 0.4378673
## 0.010 2 15 2000 0.7958441 0.4425964
## 0.100 1 2 500 0.7932150 0.4275242
## 0.100 1 2 1000 0.7930707 0.4289401
## 0.100 1 2 2000 0.7917553 0.4284967
## 0.100 1 5 500 0.7950611 0.4331179
## 0.100 1 5 1000 0.7929287 0.4295684
## 0.100 1 5 2000 0.7925026 0.4304894
## 0.100 1 15 500 0.7942814 0.4303992
## 0.100 1 15 1000 0.7943871 0.4330570
## 0.100 1 15 2000 0.7931058 0.4316006
## 0.100 2 2 500 0.7921830 0.4355337
## 0.100 2 2 1000 0.7900478 0.4330814
## 0.100 2 2 2000 0.7866348 0.4247116
## 0.100 2 5 500 0.7917204 0.4355998
## 0.100 2 5 1000 0.7911854 0.4354312
## 0.100 2 5 2000 0.7870610 0.4261621
## 0.100 2 15 500 0.7922884 0.4360380
## 0.100 2 15 1000 0.7892317 0.4310267
## 0.100 2 15 2000 0.7874897 0.4263105
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 2000, interaction.depth =
## 2, shrinkage = 0.01 and n.minobsinnode = 5.
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_boost, highlight = TRUE) +
labs(title = "Evolución del accuracy del modelo Gradient Boosting") +
guides(color = guide_legend(title = "shrinkage"),
shape = guide_legend(title = "shrinkage")) +
theme_bw() +
theme(legend.position = "bottom")
Model: Support Vector Machine (radial kernel)
# HYPERPARAMETERS, NUMBER OF REPETITIONS, AND SEEDS FOR EACH REPETITION
#===============================================================================
particiones <- 10
repeticiones <- 5
# Hyperparameters
hiperparametros <- expand.grid(sigma = c(0.001, 0.01, 0.1, 0.5, 1),
C = c(1 , 20, 50, 100, 200, 500, 700))
set.seed(123)
seeds <- vector(mode = "list", length = (particiones * repeticiones) + 1)
for (i in 1:(particiones * repeticiones)) {
seeds[[i]] <- sample.int(1000, nrow(hiperparametros))
}
seeds[[(particiones * repeticiones) + 1]] <- sample.int(1000, 1)
# TRAINING DEFINITION
#===============================================================================
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, seeds = seeds,
returnResamp = "final", verboseIter = FALSE,
allowParallel = TRUE)
# MODEL FIT
# ==============================================================================
set.seed(342)
modelo_svmrad <- train(Churn ~ ., data = datos_train_prep,
method = "svmRadial",
tuneGrid = hiperparametros,
metric = "Accuracy",
trControl = control_train)
modelo_svmrad
## Support Vector Machines with Radial Basis Function Kernel
##
## 5627 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5064, 5064, 5064, 5064, 5065, 5065, ...
## Resampling results across tuning parameters:
##
## sigma C Accuracy Kappa
## 0.001 1 0.7946376 0.4447396
## 0.001 20 0.7902657 0.4126974
## 0.001 50 0.7895897 0.3976770
## 0.001 100 0.7894830 0.3933122
## 0.001 200 0.7878133 0.3919951
## 0.001 500 0.7887724 0.3987108
## 0.001 700 0.7890920 0.4009527
## 0.010 1 0.7872785 0.3846160
## 0.010 20 0.7875637 0.3967441
## 0.010 50 0.7877770 0.3941641
## 0.010 100 0.7885587 0.4010363
## 0.010 200 0.7888432 0.4099250
## 0.010 500 0.7891272 0.4208492
## 0.010 700 0.7894121 0.4242435
## 0.100 1 0.7914398 0.4268918
## 0.100 20 0.7883799 0.4267513
## 0.100 50 0.7868873 0.4296732
## 0.100 100 0.7885222 0.4377427
## 0.100 200 0.7900494 0.4474032
## 0.100 500 0.7894821 0.4499308
## 0.100 700 0.7898729 0.4518908
## 0.500 1 0.7907981 0.4306511
## 0.500 20 0.7895893 0.4517241
## 0.500 50 0.7864257 0.4457784
## 0.500 100 0.7832271 0.4390163
## 0.500 200 0.7800997 0.4341689
## 0.500 500 0.7742723 0.4227378
## 0.500 700 0.7726727 0.4198318
## 1.000 1 0.7901578 0.4323429
## 1.000 20 0.7827299 0.4366312
## 1.000 50 0.7790345 0.4319732
## 1.000 100 0.7742013 0.4241068
## 1.000 200 0.7716069 0.4209782
## 1.000 500 0.7666675 0.4145062
## 1.000 700 0.7636809 0.4101711
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.001 and C = 1.
modelo_svmrad$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.001
##
## Number of Support Vectors : 2928
##
## Objective Function Value : -2792.835
## Training error : 0.204016
# GRAPHICAL REPRESENTATION
# ==============================================================================
ggplot(modelo_svmrad, highlight = TRUE) +
labs(title = "Evolución del accuracy del modelo SVM Radial") +
theme_bw()
Model comparison process. This step compares the results of the different
models in order to determine which of them meet the desired KPIs.
modelos <- list(KNN = modelo_knn, NB = modelo_nb, logistic = modelo_logistic,
LDA = modelo_lda, arbol = modelo_C50Tree, rf = modelo_rf,
boosting = modelo_boost, SVMradial = modelo_svmrad)
resultados_resamples <- resamples(modelos)
resultados_resamples$values %>% head(10)
## Resample KNN~Accuracy KNN~Kappa NB~Accuracy NB~Kappa logistic~Accuracy
## 1 Fold01.Rep1 0.7531083 0.2848997 0.7015986 0.3915863 0.7904085
## 2 Fold01.Rep2 0.7508897 0.2974122 0.7241993 0.4243401 0.7864769
## 3 Fold01.Rep3 0.7513321 0.2780200 0.7069272 0.3868924 0.7904085
## 4 Fold01.Rep4 0.7531083 0.2950645 0.7122558 0.4011582 0.7868561
## 5 Fold01.Rep5 0.7313167 0.2182508 0.6814947 0.3443565 0.7811388
## 6 Fold02.Rep1 0.7371226 0.2650908 0.7069272 0.3826057 0.7779751
## 7 Fold02.Rep2 0.7602131 0.3185792 0.7069272 0.4054682 0.7921847
## 8 Fold02.Rep3 0.7429078 0.2918745 0.6879433 0.3653522 0.7996454
## 9 Fold02.Rep4 0.7566607 0.3117298 0.6820604 0.3593366 0.7655417
## 10 Fold02.Rep5 0.7375887 0.2788721 0.6666667 0.3377305 0.7677305
## logistic~Kappa LDA~Accuracy LDA~Kappa arbol~Accuracy arbol~Kappa rf~Accuracy
## 1 0.4375434 0.7904085 0.4473872 0.8028419 0.4902068 0.7939609
## 2 0.4116518 0.7740214 0.3899762 0.7615658 0.3178747 0.7846975
## 3 0.4324888 0.7904085 0.4350274 0.7904085 0.4521811 0.7850799
## 4 0.4254516 0.7815275 0.4176009 0.7886323 0.4463378 0.7957371
## 5 0.4091896 0.7740214 0.4007925 0.7722420 0.3837825 0.7793594
## 6 0.3947434 0.7779751 0.4107378 0.7797513 0.4268120 0.7850799
## 7 0.4410654 0.7921847 0.4508691 0.7886323 0.4463378 0.7921847
## 8 0.4580612 0.7996454 0.4699601 0.7890071 0.4652964 0.7978723
## 9 0.3594001 0.7619893 0.3612781 0.7637655 0.3674531 0.7655417
## 10 0.3855289 0.7659574 0.3821948 0.7854610 0.4493610 0.7836879
## rf~Kappa boosting~Accuracy boosting~Kappa SVMradial~Accuracy
## 1 0.4519578 0.7957371 0.4456716 0.7921847
## 2 0.4081121 0.7740214 0.3671283 0.7793594
## 3 0.4320954 0.7992895 0.4577541 0.7904085
## 4 0.4380799 0.7939609 0.4370485 0.7850799
## 5 0.4083909 0.7740214 0.3844203 0.7864769
## 6 0.4270708 0.7761989 0.3912782 0.7779751
## 7 0.4460106 0.7939609 0.4446032 0.8010657
## 8 0.4617239 0.7960993 0.4434720 0.8049645
## 9 0.3594001 0.7708703 0.3667815 0.7548845
## 10 0.4339585 0.7836879 0.4188121 0.7624113
## SVMradial~Kappa
## 1 0.4484505
## 2 0.3948278
## 3 0.4324888
## 4 0.4219565
## 5 0.4222961
## 6 0.4028477
## 7 0.4637548
## 8 0.4783047
## 9 0.3363004
## 10 0.3616461
metricas_resamples <- resultados_resamples$values %>%
gather(key = "modelo", value = "valor", -Resample) %>%
separate(col = "modelo", into = c("modelo", "metrica"),
sep = "~", remove = TRUE)
metricas_resamples %>% head()
## Resample modelo metrica valor
## 1 Fold01.Rep1 KNN Accuracy 0.7531083
## 2 Fold01.Rep2 KNN Accuracy 0.7508897
## 3 Fold01.Rep3 KNN Accuracy 0.7513321
## 4 Fold01.Rep4 KNN Accuracy 0.7531083
## 5 Fold01.Rep5 KNN Accuracy 0.7313167
## 6 Fold02.Rep1 KNN Accuracy 0.7371226
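gather(), separate(), and spread() still work but are superseded in current tidyr; an equivalent sketch with pivot_longer (assumes tidyr >= 1.0):
metricas_resamples <- resultados_resamples$values %>%
  tidyr::pivot_longer(cols = -Resample,
                      names_to = c("modelo", "metrica"),
                      names_sep = "~",
                      values_to = "valor")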
Average Accuracy and Kappa of each model
metricas_resamples %>%
group_by(modelo, metrica) %>%
summarise(media = mean(valor)) %>%
spread(key = metrica, value = media) %>%
arrange(desc(Accuracy))
## `summarise()` has grouped output by 'modelo'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 3
## # Groups: modelo [8]
## modelo Accuracy Kappa
## <chr> <dbl> <dbl>
## 1 rf 0.798 0.454
## 2 boosting 0.796 0.442
## 3 SVMradial 0.795 0.445
## 4 logistic 0.794 0.441
## 5 LDA 0.794 0.448
## 6 arbol 0.792 0.449
## 7 KNN 0.755 0.297
## 8 NB 0.708 0.397
Graphical representation
metricas_resamples %>%
filter(metrica == "Accuracy") %>%
group_by(modelo) %>%
summarise(media = mean(valor)) %>%
ggplot(aes(x = reorder(modelo, media), y = media, label = round(media, 2))) +
geom_segment(aes(x = reorder(modelo, media), y = 0,
xend = modelo, yend = media),
color = "grey50") +
geom_point(size = 7, color = "firebrick") +
geom_text(color = "white", size = 2.5) +
scale_y_continuous(limits = c(0, 1)) +
# Baseline accuracy: proportion of the majority class ("No"), about 0.734
geom_hline(yintercept = 0.734, linetype = "dashed") +
annotate(geom = "text", y = 0.80, x = 8.5, label = "Baseline accuracy") +
labs(title = "Validation: mean accuracy over repeated CV",
subtitle = "Models ordered by mean accuracy",
x = "model") +
coord_flip() +
theme_bw()
metricas_resamples %>% filter(metrica == "Accuracy") %>%
group_by(modelo) %>%
mutate(media = mean(valor)) %>%
ungroup() %>%
ggplot(aes(x = reorder(modelo, media), y = valor, color = modelo)) +
geom_boxplot(alpha = 0.6, outlier.shape = NA) +
geom_jitter(width = 0.1, alpha = 0.6) +
scale_y_continuous(limits = c(0, 1)) +
# Baseline accuracy: proportion of the majority class ("No"), about 0.734
geom_hline(yintercept = 0.734, linetype = "dashed") +
annotate(geom = "text", y = 0.65, x = 8.5, label = "Baseline accuracy") +
theme_bw() +
labs(title = "Validation: mean accuracy over repeated CV",
subtitle = "Models ordered by mean accuracy") +
coord_flip() +
theme(legend.position = "none")
# Multiple comparisons with paired Wilcoxon (signed-rank) tests
# ==============================================================================
metricas_accuracy <- metricas_resamples %>% filter(metrica == "Accuracy")
comparaciones <- pairwise.wilcox.test(x = metricas_accuracy$valor,
g = metricas_accuracy$modelo,
paired = TRUE,
p.adjust.method = "holm")
# The p-values are stored as a data frame
comparaciones <- comparaciones$p.value %>%
as.data.frame() %>%
rownames_to_column(var = "modeloA") %>%
gather(key = "modeloB", value = "p_value", -modeloA) %>%
na.omit() %>%
arrange(modeloA)
comparaciones
## modeloA modeloB p_value
## 1 boosting arbol 0.02764486331395
## 2 KNN arbol 0.00000002169547
## 3 KNN boosting 0.00000002169547
## 4 LDA arbol 1.00000000000000
## 5 LDA boosting 0.86201775379628
## 6 LDA KNN 0.00000002169547
## 7 logistic arbol 0.86201775379628
## 8 logistic boosting 1.00000000000000
## 9 logistic KNN 0.00000002169547
## 10 logistic LDA 1.00000000000000
## 11 NB arbol 0.00000002169547
## 12 NB boosting 0.00000002169547
## 13 NB KNN 0.00000002169547
## 14 NB LDA 0.00000002169547
## 15 NB logistic 0.00000002169547
## 16 rf arbol 0.00053500316876
## 17 rf boosting 0.86201775379628
## 18 rf KNN 0.00000002169547
## 19 rf LDA 0.04574417935984
## 20 rf logistic 0.30426220952912
## 21 rf NB 0.00000002169547
## 22 SVMradial arbol 0.88300816132906
## 23 SVMradial boosting 1.00000000000000
## 24 SVMradial KNN 0.00000002169547
## 25 SVMradial LDA 0.88300816132906
## 26 SVMradial logistic 1.00000000000000
## 27 SVMradial NB 0.00000002169547
## 28 SVMradial rf 0.28990173776701
Test error
predicciones <- extractPrediction(
models = modelos,
testX = datos_test_prep %>% select(-Churn),
testY = datos_test_prep$Churn
)
predicciones %>% head()
## obs pred model dataType object
## 1 No Yes knn Training KNN
## 2 No No knn Training KNN
## 3 Yes No knn Training KNN
## 4 No No knn Training KNN
## 5 Yes Yes knn Training KNN
## 6 Yes No knn Training KNN
metricas_predicciones <- predicciones %>%
mutate(acierto = ifelse(obs == pred, TRUE, FALSE)) %>%
group_by(object, dataType) %>%
summarise(accuracy = mean(acierto))
## `summarise()` has grouped output by 'object'. You can override using the
## `.groups` argument.
metricas_predicciones %>%
spread(key = dataType, value = accuracy) %>%
arrange(desc(Test))
## # A tibble: 8 × 3
## # Groups: object [8]
## object Test Training
## <chr> <dbl> <dbl>
## 1 logistic 0.806 0.795
## 2 boosting 0.804 0.804
## 3 arbol 0.801 0.800
## 4 rf 0.8 0.817
## 5 SVMradial 0.799 0.796
## 6 LDA 0.797 0.794
## 7 KNN 0.755 0.796
## 8 NB 0.705 0.709
Graphical comparison of test and training accuracy
ggplot(data = metricas_predicciones,
aes(x = reorder(object, accuracy), y = accuracy,
color = dataType, label = round(accuracy, 2))) +
geom_point(size = 8) +
scale_color_manual(values = c("orangered2", "gray50")) +
geom_text(color = "white", size = 3) +
scale_y_continuous(limits = c(0, 1)) +
# Baseline accuracy: proportion of the majority class ("No"), about 0.734
geom_hline(yintercept = 0.734, linetype = "dashed") +
annotate(geom = "text", y = 0.80, x = 8.5, label = "Baseline accuracy") +
coord_flip() +
labs(title = "Training and test accuracy",
x = "model") +
theme_bw() +
theme(legend.position = "bottom")
Based on the analysis above, the random forest (rf) model is chosen over the single C5.0 tree, since it attains the highest cross-validated accuracy; note, however, that the logistic regression achieves the best accuracy on the test set.
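As a closing check, a minimal sketch of how the chosen model could be evaluated on the held-out test set with a confusion matrix (caret::confusionMatrix; treating "Yes" as the positive class is an assumption):
pred_test <- predict(modelo_rf, newdata = datos_test_prep)
confusionMatrix(data = pred_test,
                reference = datos_test_prep$Churn,
                positive = "Yes")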