0 - Parámetros iniciales
#Desactivamos la notación científica
options(scipen=999)
1 - Configuración del entorno
1.1 - Cargamos las librerías a utilizar
#lista de paquetes que vamos a usar
paquetes <- c('data.table',#para leer y escribir datos de forma rapida
'dplyr',#para manipulación de datos
'tidyr',#para manipulación de datos
'ggplot2',#para gráficos
'randomForest',#para crear los modelos
'ROCR',#para evaluar modelos
'purrr',#para usar la función map que aplica la misma funciona a varios componentes de un dataframe
'smbinning',#para calcular la para importancia de las variables
'rpart',#para crear arboles de decisión
'rpart.plot',#para el gráfico del árbol
'tictoc',#para calcular el tiempo transcurrido entre dos acciones
'tidyverse',#para manipulación de datos
'visdat',#para una visión general del dataset
'dlookr',#visualización de missings
'inspectdf',#para un análisis expploratorio breve
'caret',#clasificación y regresión
'PerformanceAnalytics',#graficar correlaciones
'e1071',#para modelar SVM
'rgl',#para modelar SVM
'pROC'
)
#Crea un vector lógico con si están instalados o no
instalados <- paquetes %in% installed.packages()
#Si hay al menos uno no instalado los instala
if(sum(instalados == FALSE) > 0) {
install.packages(paquetes[!instalados])
}
lapply(paquetes,require,character.only = TRUE)
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] TRUE
##
## [[6]]
## [1] TRUE
##
## [[7]]
## [1] TRUE
##
## [[8]]
## [1] TRUE
##
## [[9]]
## [1] TRUE
##
## [[10]]
## [1] TRUE
##
## [[11]]
## [1] TRUE
##
## [[12]]
## [1] TRUE
##
## [[13]]
## [1] TRUE
##
## [[14]]
## [1] TRUE
##
## [[15]]
## [1] TRUE
##
## [[16]]
## [1] TRUE
##
## [[17]]
## [1] TRUE
##
## [[18]]
## [1] TRUE
##
## [[19]]
## [1] TRUE
##
## [[20]]
## [1] TRUE
1.2 - Cargamos los datos
Primera visualización de datos antes de la importación
read_lines('C:/R/DS4B/Proyecto_entregable/Telco-Customer-Churn.csv', n_max=10)
## [1] "customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn"
## [2] "7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No"
## [3] "5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No"
## [4] "3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes"
## [5] "7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No"
## [6] "9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes"
## [7] "9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes"
## [8] "1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No"
## [9] "6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No"
## [10] "7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes"
Usamos fread de data.table para una lectura de datos mucho mas rápida
tic()
df <- fread('C:/R/DS4B/Proyecto_entregable/Telco-Customer-Churn.csv')
toc()
## 0.01 sec elapsed
2 - Análisis exploratorio
2.1 - Análisis exploratorio general y tipo de datos
as.data.frame(names(df))
## names(df)
## 1 customerID
## 2 gender
## 3 SeniorCitizen
## 4 Partner
## 5 Dependents
## 6 tenure
## 7 PhoneService
## 8 MultipleLines
## 9 InternetService
## 10 OnlineSecurity
## 11 OnlineBackup
## 12 DeviceProtection
## 13 TechSupport
## 14 StreamingTV
## 15 StreamingMovies
## 16 Contract
## 17 PaperlessBilling
## 18 PaymentMethod
## 19 MonthlyCharges
## 20 TotalCharges
## 21 Churn
A continuación se detallan las variables que contiene el dataset,
customerID.- Recoge el ID de cada cliente
Variables personales:
gender.- Indica si el cliente es hombre o mujer
SeniorCitizen.- Indica si el cliente es una persona mayor o no
Partner.- Indica si el cliente tiene pareja o no
Dependents.- Indica si el cliente tiene dependientes o no
Variable que indica antigüedad:
tenure.- Indica el nº de meses que el cliente ha permanecido en la cía, o que permanece
Variables asociadas al servicio suscrito:
PhoneService.- Indica si el cliente tiene servicio telefónico o no
MultipleLines.- Indica si el cliente tiene múltiples líneas o no
InternetService.- Indica el proveedor de servicios de internet del cliente
OnlineSecurity.- Indica si el cliente tiene seguridad en línea o no
OnlineBackup.- Indica si el cliente tiene respaldo en línea o no
DeviceProtection.- Indica si el cliente tiene protección del dispositivo o no
TechSupport.- Indica si el cliente tiene soporte técnico o no
StreamingTV.- Indica si el cliente tiene servicio de TV en streaming o no
StreamingMovies.- Indica si el cliente dispone del servicio de películas en streaming o no
Variables asociadas al tipo de contrato:
Contract.- Indica el plazo del contrato
PaperlessBilling.- Indica si el cliente dispone de facturación electrónica o no
PaymentMethod.- Indica el método de pago del cliente
Otras variables:
MonthlyCharges.- Indica el importe cobrado mensualmente al cliente
TotalCharges.- Indica la cantidad total cargada al cliente
Churn.- Indica si el cliente abandonó o no, será la variable a predecir
str(df)
## Classes 'data.table' and 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
## - attr(*, ".internal.selfref")=<externalptr>
glimpse(df)
## Rows: 7,043
## Columns: 21
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ gender <chr> "Female", "Male", "Male", "Male", "Female", "Femal...
## $ SeniorCitizen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "...
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "...
## $ tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49...
## $ PhoneService <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No...
## $ MultipleLines <chr> "No phone service", "No", "No", "No phone service"...
## $ InternetService <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber ...
## $ OnlineSecurity <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes"...
## $ OnlineBackup <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No",...
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No",...
## $ TechSupport <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "...
## $ StreamingTV <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", ...
## $ StreamingMovies <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "...
## $ Contract <chr> "Month-to-month", "One year", "Month-to-month", "O...
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No...
## $ PaymentMethod <chr> "Electronic check", "Mailed check", "Mailed check"...
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1...
## $ Churn <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No",...
Este dataset contiene 7.043 filas, que corresponden a diferentes clientes de la cía, y 21 columnas, que son diferentes atributos del cliente. La columna que indica “Churn”, es nuestro target.
Conclusiones:
a_factores <- c('gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod')
2.2 - Calidad de datos: Estadísticos básicos
Hacemos un summary con lapply a las variables que no son character, que sale en formato de lista y se lee mejor. Previamente tenemos una visión general del tipo de variables del dataset, número de observaciones, y eventuales NA, de forma fácilmente interpretable
vis_dat(df, sort_type = FALSE)
son_character <- c(a_factores, 'customerID', 'Churn')
df %>%
select(-son_character) %>%
lapply(summary)
## $tenure
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 29.00 32.37 55.00 72.00
##
## $MonthlyCharges
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.25 35.50 70.35 64.76 89.85 118.75
##
## $TotalCharges
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.8 401.4 1397.5 2283.3 3794.7 8684.8 11
Vemos que hay algunas observaciones con antigüedad cero, vamos a contar las que hay
df %>% count(tenure == 0)
## tenure == 0 n
## 1: FALSE 7032
## 2: TRUE 11
Hay 11 valores de antigüedad igual a cero
Detectamos 11 nulos en la variable TotalCharges, que trataremos posteriormente. Por lo demás, no detectamos nada raro.
2.3 - Calidad de datos: Análisis de nulos
data.frame(colSums(is.na(df)))
## colSums.is.na.df..
## customerID 0
## gender 0
## SeniorCitizen 0
## Partner 0
## Dependents 0
## tenure 0
## PhoneService 0
## MultipleLines 0
## InternetService 0
## OnlineSecurity 0
## OnlineBackup 0
## DeviceProtection 0
## TechSupport 0
## StreamingTV 0
## StreamingMovies 0
## Contract 0
## PaperlessBilling 0
## PaymentMethod 0
## MonthlyCharges 0
## TotalCharges 11
## Churn 0
Verificamos que no hay nulos en el resto de variables
#para una visión general de los missings por variable
vis_miss(df)
Verificamos que el porcentaje de missings en el dataset es menor de un 0.1%
df %>% filter(is.na(TotalCharges))
## customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## 1: 4472-LVYGI Female 0 Yes Yes 0 No
## 2: 3115-CZMZD Male 0 No Yes 0 Yes
## 3: 5709-LVOEQ Female 0 Yes Yes 0 Yes
## 4: 4367-NUYAO Male 0 Yes Yes 0 Yes
## 5: 1371-DWPAZ Female 0 Yes Yes 0 No
## 6: 7644-OMVMY Male 0 Yes Yes 0 Yes
## 7: 3213-VVOLG Male 0 Yes Yes 0 Yes
## 8: 2520-SGTTA Female 0 Yes Yes 0 Yes
## 9: 2923-ARZLG Male 0 Yes Yes 0 Yes
## 10: 4075-WKNIU Female 0 Yes Yes 0 Yes
## 11: 2775-SEFEE Male 0 No Yes 0 Yes
## MultipleLines InternetService OnlineSecurity OnlineBackup
## 1: No phone service DSL Yes No
## 2: No No No internet service No internet service
## 3: No DSL Yes Yes
## 4: Yes No No internet service No internet service
## 5: No phone service DSL Yes Yes
## 6: No No No internet service No internet service
## 7: Yes No No internet service No internet service
## 8: No No No internet service No internet service
## 9: No No No internet service No internet service
## 10: Yes DSL No Yes
## 11: Yes DSL Yes Yes
## DeviceProtection TechSupport StreamingTV
## 1: Yes Yes Yes
## 2: No internet service No internet service No internet service
## 3: Yes No Yes
## 4: No internet service No internet service No internet service
## 5: Yes Yes Yes
## 6: No internet service No internet service No internet service
## 7: No internet service No internet service No internet service
## 8: No internet service No internet service No internet service
## 9: No internet service No internet service No internet service
## 10: Yes Yes Yes
## 11: No Yes No
## StreamingMovies Contract PaperlessBilling PaymentMethod
## 1: No Two year Yes Bank transfer (automatic)
## 2: No internet service Two year No Mailed check
## 3: Yes Two year No Mailed check
## 4: No internet service Two year No Mailed check
## 5: No Two year No Credit card (automatic)
## 6: No internet service Two year No Mailed check
## 7: No internet service Two year No Mailed check
## 8: No internet service Two year No Mailed check
## 9: No internet service One year Yes Mailed check
## 10: No Two year No Mailed check
## 11: No Two year Yes Bank transfer (automatic)
## MonthlyCharges TotalCharges Churn
## 1: 52.55 NA No
## 2: 20.25 NA No
## 3: 80.85 NA No
## 4: 25.75 NA No
## 5: 56.05 NA No
## 6: 19.85 NA No
## 7: 25.35 NA No
## 8: 20.00 NA No
## 9: 19.70 NA No
## 10: 73.35 NA No
## 11: 61.90 NA No
Comprobamos correlaciones de las variables numéricas
chart.Correlation(df[,c('TotalCharges', 'MonthlyCharges', 'tenure')], histogram = TRUE, pch = 15)
Conclusión:
2.4 - Calidad de datos: Análisis de ceros
Procedemos a un conteo de las variables de tipo character para comprobar la proporción de ceros que pudiera haber, y si algún nulo pudiera estar camuflado como cero
df %>%
select(a_factores) %>%
lapply(table)
## $gender
##
## Female Male
## 3488 3555
##
## $SeniorCitizen
##
## 0 1
## 5901 1142
##
## $Partner
##
## No Yes
## 3641 3402
##
## $Dependents
##
## No Yes
## 4933 2110
##
## $PhoneService
##
## No Yes
## 682 6361
##
## $MultipleLines
##
## No No phone service Yes
## 3390 682 2971
##
## $InternetService
##
## DSL Fiber optic No
## 2421 3096 1526
##
## $OnlineSecurity
##
## No No internet service Yes
## 3498 1526 2019
##
## $OnlineBackup
##
## No No internet service Yes
## 3088 1526 2429
##
## $DeviceProtection
##
## No No internet service Yes
## 3095 1526 2422
##
## $TechSupport
##
## No No internet service Yes
## 3473 1526 2044
##
## $StreamingTV
##
## No No internet service Yes
## 2810 1526 2707
##
## $StreamingMovies
##
## No No internet service Yes
## 2785 1526 2732
##
## $Contract
##
## Month-to-month One year Two year
## 3875 1473 1695
##
## $PaperlessBilling
##
## No Yes
## 2872 4171
##
## $PaymentMethod
##
## Bank transfer (automatic) Credit card (automatic) Electronic check
## 1544 1522 2365
## Mailed check
## 1612
Conclusiones:
2.5 - Calidad de datos: Análisis de atípicos
2.5.1 - Analizamos las que son de tipo numérico
out <- function(variable){
t(t(head(sort(variable,decreasing = T),20)))
}
lapply(df,function(x){
if(is.double(x)) out(x)
})
## $customerID
## NULL
##
## $gender
## NULL
##
## $SeniorCitizen
## NULL
##
## $Partner
## NULL
##
## $Dependents
## NULL
##
## $tenure
## NULL
##
## $PhoneService
## NULL
##
## $MultipleLines
## NULL
##
## $InternetService
## NULL
##
## $OnlineSecurity
## NULL
##
## $OnlineBackup
## NULL
##
## $DeviceProtection
## NULL
##
## $TechSupport
## NULL
##
## $StreamingTV
## NULL
##
## $StreamingMovies
## NULL
##
## $Contract
## NULL
##
## $PaperlessBilling
## NULL
##
## $PaymentMethod
## NULL
##
## $MonthlyCharges
## [,1]
## [1,] 118.75
## [2,] 118.65
## [3,] 118.60
## [4,] 118.60
## [5,] 118.35
## [6,] 118.20
## [7,] 117.80
## [8,] 117.60
## [9,] 117.50
## [10,] 117.45
## [11,] 117.35
## [12,] 117.20
## [13,] 117.15
## [14,] 116.95
## [15,] 116.85
## [16,] 116.80
## [17,] 116.75
## [18,] 116.60
## [19,] 116.60
## [20,] 116.55
##
## $TotalCharges
## [,1]
## [1,] 8684.80
## [2,] 8672.45
## [3,] 8670.10
## [4,] 8594.40
## [5,] 8564.75
## [6,] 8547.15
## [7,] 8543.25
## [8,] 8529.50
## [9,] 8496.70
## [10,] 8477.70
## [11,] 8477.60
## [12,] 8476.50
## [13,] 8468.20
## [14,] 8456.75
## [15,] 8443.70
## [16,] 8436.25
## [17,] 8425.30
## [18,] 8425.15
## [19,] 8424.90
## [20,] 8405.00
##
## $Churn
## NULL
No vemos nada raro, escalado normal en ambas variables
2.5.2 - Analizamos las que son de tipo integer
out <- function(variable){
t(t(table(variable)))
}
lapply(df,function(x){
if(is.integer(x)) out(x)
})
## $customerID
## NULL
##
## $gender
## NULL
##
## $SeniorCitizen
##
## variable [,1]
## 0 5901
## 1 1142
##
## $Partner
## NULL
##
## $Dependents
## NULL
##
## $tenure
##
## variable [,1]
## 0 11
## 1 613
## 2 238
## 3 200
## 4 176
## 5 133
## 6 110
## 7 131
## 8 123
## 9 119
## 10 116
## 11 99
## 12 117
## 13 109
## 14 76
## 15 99
## 16 80
## 17 87
## 18 97
## 19 73
## 20 71
## 21 63
## 22 90
## 23 85
## 24 94
## 25 79
## 26 79
## 27 72
## 28 57
## 29 72
## 30 72
## 31 65
## 32 69
## 33 64
## 34 65
## 35 88
## 36 50
## 37 65
## 38 59
## 39 56
## 40 64
## 41 70
## 42 65
## 43 65
## 44 51
## 45 61
## 46 74
## 47 68
## 48 64
## 49 66
## 50 68
## 51 68
## 52 80
## 53 70
## 54 68
## 55 64
## 56 80
## 57 65
## 58 67
## 59 60
## 60 76
## 61 76
## 62 70
## 63 72
## 64 80
## 65 76
## 66 89
## 67 98
## 68 100
## 69 95
## 70 119
## 71 170
## 72 362
##
## $PhoneService
## NULL
##
## $MultipleLines
## NULL
##
## $InternetService
## NULL
##
## $OnlineSecurity
## NULL
##
## $OnlineBackup
## NULL
##
## $DeviceProtection
## NULL
##
## $TechSupport
## NULL
##
## $StreamingTV
## NULL
##
## $StreamingMovies
## NULL
##
## $Contract
## NULL
##
## $PaperlessBilling
## NULL
##
## $PaymentMethod
## NULL
##
## $MonthlyCharges
## NULL
##
## $TotalCharges
## NULL
##
## $Churn
## NULL
Visualizamos la distribución de la variable tenure a través de un histograma
hist(df$tenure,breaks = 50)
df %>%
filter(tenure <= 3 | tenure >= 70) %>%
summarise(tot_extremos=n()) %>%
mutate(porcentaje_extremos = tot_extremos / nrow(df))
## tot_extremos porcentaje_extremos
## 1 1713 0.2432202
Del análisis de la antigüedad se desprende que la mayor parte de los clientes están concentrados en los extremos, tanto en valores de 1 mes, como en el valor máximo de 72, aprox un 24% Convendría encontrar alguna explicación a nivel de negocio, pero podría venir justificada la acumulación de clientes en el tramo bajo porque se llevó a cabo una campaña agresiva de captación de clientes, con resultado positivo. La acumulación de clientes en el tramo alto, podría ser debida a un cambio en el software que se produjo hace 72 meses (6 años), con lo que las antigüedades superiores, se concentran en ese valor. En todo caso, como vamos a discretizar esa variable, con lo que solucionaríamos el eventual problema
2.6 - Análisis longitudinal
No procede este tipo de análisis
2.7 - Análisis de coherencia
longi <- df %>%
select(-son_character) %>%
summarise_all(mean, na.rm=TRUE) %>% #calcular la media de cada variable
t() %>% #trasponerlo para tenerlo en una sola columna y leerlo mejor
as.data.frame() #reconvertirlo a dataframe porque t() lo pasa a matriz
data.frame(variable = rownames(longi), media = longi$V1) %>% #crear un nuevo dataframe para poder ordenar por el nombre
arrange(desc(variable)) #ordenar por el nombre para tener la visión longitudinal
## variable media
## 1 TotalCharges 2283.30044
## 2 tenure 32.37115
## 3 MonthlyCharges 64.76169
De este análisis previo se desprende coherencia entre las variables numéricas. El producto entre la antigüedad media y el valor medio de los cargos mensuales, nos da aproximadamente el valor medio del total de cargos
Visualizamos por medio de histogramas las distribuciones de las variables MonthlyCharges y TotalCharges
hist(df$MonthlyCharges,breaks = 50)
hist(df$TotalCharges,breaks = 50)
2.8 - Acciones resultado del analisis de calidad de datos y exploratorio
Vamos a hacer lo siguiente:
- transformar a factor las variables de ‘a_factores’
- los atípicos no nos preocupan porque vamos a discretizar
- Eliminamos las observaciones con valores faltantes
- Eliminar la variable “TotalCharges”
df <- df %>%
mutate_at(a_factores,.funs = factor) %>%
filter(!is.na(TotalCharges)) %>%
select(-TotalCharges)
Revisamos para ver como queda el dataset
glimpse(df)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ gender <fct> Female, Male, Male, Male, Female, Female, Male, Fe...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, No,...
## $ Dependents <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, No,...
## $ tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49...
## $ PhoneService <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes...
## $ MultipleLines <fct> No phone service, No, No, No phone service, No, Ye...
## $ InternetService <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, Fibe...
## $ OnlineSecurity <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Yes, ...
## $ OnlineBackup <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No, No...
## $ DeviceProtection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No, No...
## $ TechSupport <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, No i...
## $ StreamingTV <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, No ...
## $ StreamingMovies <fct> No, No, No, No, No, Yes, No, No, Yes, No, No, No i...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One year...
## $ PaperlessBilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No, Yes,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, Bank...
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ Churn <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No",...
3 - Trasformación de datos
3.1 - Creación de la variable target
Creación de la variable abandono (para el entrenamiento)
df <- df %>%
mutate(TARGET_Churn = as.factor(ifelse(Churn == "Yes", 1, 0))) %>%
select(-Churn)
3.2 - Preparación de las variables independientes
3.2.1 - Preselección de variables independientes
Creamos una lista larga con todas las variables independientes.
ind_larga <- names(df) #lista con todas las variables
no_usar <- c('customerID', 'TARGET_Churn') #identificamos las que no queremos usar como variables predictoras
ind_larga<-setdiff(ind_larga,no_usar) #quitamos la que no usaremos
Guardamos la sesión poder recuperar el trabajo desde aquí
save.image(file='sesion1.RData')
load(file='sesion1.RData') #hay que cargar también los paquetes
3.2.1.1 - Preselección con RandomForest
pre_rf <- randomForest(formula = reformulate(ind_larga,'TARGET_Churn'), data= df,mtry=2,ntree=50, importance = T)
imp_rf <- importance(pre_rf)[,4] #como importance devuelve una matriz con varias métricas tenemos que extraer asi el decrecimiento en Gini que es el que mas nos interesa
imp_rf <- data.frame(VARIABLE = names(imp_rf), IMP_RF = imp_rf) #lo transformamos a dataframe
imp_rf <- imp_rf %>% arrange(desc(IMP_RF)) %>% mutate(RANKING_RF = 1:nrow(imp_rf)) #creamos el ranking
3.2.1.2 - Preselección con Information Value
temp <- mutate(df,TARGET_Churn = as.numeric(as.character(TARGET_Churn))) %>% as.data.frame() #transformo la target a numérico temporalmente porque este algoritmo necesita que este en numérico, y el as.character es para que lo convierta a 0 y 1, y no a 1 y 2
imp_iv <- smbinning.sumiv(temp[c(ind_larga,'TARGET_Churn')],y="TARGET_Churn")
##
##
|
| | 0%
|
|--- | 5%
|
|----- | 11%
|
|-------- | 16%
|
|----------- | 21%
|
|------------- | 26%
|
|---------------- | 32%
|
|------------------ | 37%
|
|--------------------- | 42%
|
|------------------------ | 47%
|
|-------------------------- | 53%
|
|----------------------------- | 58%
|
|-------------------------------- | 63%
|
|---------------------------------- | 68%
|
|------------------------------------- | 74%
|
|--------------------------------------- | 79%
|
|------------------------------------------ | 84%
|
|--------------------------------------------- | 89%
|
|----------------------------------------------- | 95%
|
|--------------------------------------------------| 100%
##
imp_iv <- imp_iv %>% mutate(Ranking = 1:nrow(imp_iv), IV = ifelse(is.na(.$IV),0,IV)) %>% select(-Process)
names(imp_iv) <- c('VARIABLE','IMP_IV','RANKING_IV')
3.2.1.3 - Preselección final
imp_final <- inner_join(imp_rf,imp_iv,by='VARIABLE') %>%
select(VARIABLE,IMP_RF,IMP_IV,RANKING_RF,RANKING_IV) %>% #ponerlos en orden mas legible
mutate(RANKING_TOT = RANKING_RF + RANKING_IV) %>%
arrange(RANKING_TOT)
imp_final
## VARIABLE IMP_RF IMP_IV RANKING_RF RANKING_IV RANKING_TOT
## 1 tenure 256.993053 0.8773 1 2 3
## 2 Contract 166.972065 1.2332 2 1 3
## 3 OnlineSecurity 104.450763 0.7153 4 3 7
## 4 TechSupport 89.737109 0.6971 6 4 10
## 5 MonthlyCharges 163.774471 0.4824 3 8 11
## 6 InternetService 71.079799 0.6152 7 5 12
## 7 PaymentMethod 93.655591 0.4557 5 9 14
## 8 OnlineBackup 56.912047 0.5265 8 6 14
## 9 DeviceProtection 42.800653 0.4976 9 7 16
## 10 StreamingTV 37.768192 0.3787 10 11 21
## 11 StreamingMovies 30.568998 0.3799 12 10 22
## 12 PaperlessBilling 36.711052 0.2020 11 12 23
## 13 Partner 29.701999 0.1179 14 14 28
## 14 Dependents 29.426063 0.1532 15 13 28
## 15 MultipleLines 30.527571 0.0081 13 16 29
## 16 SeniorCitizen 25.908471 0.1051 17 15 32
## 17 gender 26.748579 0.0004 16 18 34
## 18 PhoneService 8.519824 0.0007 18 17 35
Vamos a calcular la correlación entre los dos métodos de selección para ver si arrojan resultados similares, y así ver si resultan fiables
cor(imp_final$IMP_RF,imp_final$IMP_IV)
## [1] 0.7794791
Si nos dan fiabilidad los resultados ya que la correlación es alta, por tanto no hay disparidad en los criterios de selección
3.2.2 - Seleccionar la lista de variables finales del proyecto
Una vez identificadas las variables importantes utilizando los dos rankings, y como el número de candidatas a predictoras no es elevado, vamos a incluir en el modelo todas las variables
importantes <- ind_larga
df %>% select(all_of(importantes)) %>%
nearZeroVar(saveMetrics = TRUE)
## freqRatio percentUnique zeroVar nzv
## gender 1.018949 0.02844141 FALSE FALSE
## SeniorCitizen 5.157618 0.02844141 FALSE FALSE
## Partner 1.072502 0.02844141 FALSE FALSE
## Dependents 2.350167 0.02844141 FALSE FALSE
## tenure 1.693370 1.02389078 FALSE FALSE
## PhoneService 9.341176 0.02844141 FALSE FALSE
## MultipleLines 1.140883 0.04266212 FALSE FALSE
## InternetService 1.281457 0.04266212 FALSE FALSE
## OnlineSecurity 1.735484 0.04266212 FALSE FALSE
## OnlineBackup 1.272990 0.04266212 FALSE FALSE
## DeviceProtection 1.279570 0.04266212 FALSE FALSE
## TechSupport 1.701961 0.04266212 FALSE FALSE
## StreamingTV 1.039216 0.04266212 FALSE FALSE
## StreamingMovies 1.018308 0.04266212 FALSE FALSE
## Contract 2.299703 0.04266212 FALSE FALSE
## PaperlessBilling 1.455307 0.02844141 FALSE FALSE
## PaymentMethod 1.474439 0.05688282 FALSE FALSE
## MonthlyCharges 1.386364 22.52559727 FALSE FALSE
Verificamos que ninguna de las variables candidatas a predictoras incorpora ruido al modelo
Selección definitiva de variables:
finales <- union(c('customerID', 'TARGET_Churn'), importantes)
finales
## [1] "customerID" "TARGET_Churn" "gender" "SeniorCitizen"
## [5] "Partner" "Dependents" "tenure" "PhoneService"
## [9] "MultipleLines" "InternetService" "OnlineSecurity" "OnlineBackup"
## [13] "DeviceProtection" "TechSupport" "StreamingTV" "StreamingMovies"
## [17] "Contract" "PaperlessBilling" "PaymentMethod" "MonthlyCharges"
3.3 - Fichero final y limpieza del entorno
df <- df %>%
select(one_of(finales))
3.3.1 - Limpieza del entorno
Durante todo el proceso anterior hemos creado muchas variables y ficheros temporales, vamos a aprovechar para limpiarlo todo y dejarlo organizado antes de pasar a la siguiente fase
ls() #Vemos todo lo que tenemos cargado en el entorno
## [1] "a_factores" "df" "finales" "imp_final"
## [5] "imp_iv" "imp_rf" "importantes" "ind_larga"
## [9] "instalados" "longi" "no_usar" "out"
## [13] "paquetes" "pre_rf" "son_character" "temp"
rm(list=setdiff(ls(),'df')) #borramos todo excepto el nuevo df
# y vamos a dejar preparadas unas variables que nos van a facilitar cosas en el futuro:
target <- 'TARGET_Churn'
indep <- setdiff(names(df),c(target,'customerID'))
Vamos a guardar un caché temporal de datos
saveRDS(df,'cacheV1.rds')
3.4 - Creación de variables sintéticas
Cargamos el caché
df <- readRDS(file = 'cacheV1.rds')
3.4.1 - Transformación
Vamos a visualizar las variables categóricas
inspect_cat(df)[1:4]
## # A tibble: 18 x 4
## col_name cnt common common_pcnt
## <chr> <int> <chr> <dbl>
## 1 Contract 3 Month-to-month 55.1
## 2 customerID 7032 0002-ORFBO 0.0142
## 3 Dependents 2 No 70.2
## 4 DeviceProtection 3 No 44.0
## 5 gender 2 Male 50.5
## 6 InternetService 3 Fiber optic 44.0
## 7 MultipleLines 3 No 48.1
## 8 OnlineBackup 3 No 43.9
## 9 OnlineSecurity 3 No 49.7
## 10 PaperlessBilling 2 Yes 59.3
## 11 Partner 2 No 51.7
## 12 PaymentMethod 4 Electronic check 33.6
## 13 PhoneService 2 Yes 90.3
## 14 SeniorCitizen 2 0 83.8
## 15 StreamingMovies 3 No 39.5
## 16 StreamingTV 3 No 39.9
## 17 TARGET_Churn 2 0 73.4
## 18 TechSupport 3 No 49.4
Vamos a revisar en profundidad aquellas variables categóricas que tienen más de dos clases
df %>%
inspect_cat() %>%
show_plot()
a <- as.data.frame(inspect_cat(df)[1:4])
a %>%
filter(cnt >2 & cnt < 7043)
## col_name cnt common common_pcnt
## 1 Contract 3 Month-to-month 55.10523322
## 2 customerID 7032 0002-ORFBO 0.01422071
## 3 DeviceProtection 3 No 43.99886234
## 4 InternetService 3 Fiber optic 44.02730375
## 5 MultipleLines 3 No 48.13708760
## 6 OnlineBackup 3 No 43.89931741
## 7 OnlineSecurity 3 No 49.72980660
## 8 PaymentMethod 4 Electronic check 33.63196815
## 9 StreamingMovies 3 No 39.54778157
## 10 StreamingTV 3 No 39.94596132
## 11 TechSupport 3 No 49.37428896
Conclusiones:
Vamos a crear indicadores de tenencia de servicio:
-DeviceProtection, reducimos a dos niveles, “Yes”, “No”, agregando “No internet service” a “No”
-InternetService, reducimos a dos niveles, “Yes”, “No”, agregando “Fiber optic” y “DSL” a “Yes”
-MultipleLines, reducimos a dos niveles, “Yes”, “No”, agregando “No phone service” a “No”
-OnlineBackup, reducimos a dos niveles, “Yes”, “No”, agregando “No internet service” a “No”
-OnlineSecurity, reducimos a dos niveles, “Yes”, “No”, agregando “No internet service” a “No”
-StreamingMovies, reducimos a dos niveles, “Yes”, “No”, agregando “No internet service” a “No”
-StreamingTV, reducimos a dos niveles, “Yes”, “No”, agregando “No internet service” a “No”
-TechSupport, reducimos a dos niveles, “Yes”, “No”, agregando “No internet service” a “No”
df <- df %>%
mutate(DeviceProtection = as.factor(ifelse(DeviceProtection == "Yes", 1, 0)),
InternetService = as.factor(ifelse(InternetService == "No", 0, 1)),
MultipleLines = as.factor(ifelse(MultipleLines == "Yes", 1, 0)),
OnlineBackup = as.factor(ifelse(OnlineBackup == "Yes", 1, 0)),
OnlineSecurity = as.factor(ifelse(OnlineSecurity == "Yes", 1, 0)),
StreamingMovies = as.factor(ifelse(StreamingMovies == "Yes", 1, 0)),
StreamingTV = as.factor(ifelse(StreamingTV == "Yes", 1, 0)),
TechSupport = as.factor(ifelse(TechSupport == "Yes", 1, 0))
)
df <- df %>%
mutate(gender = as.factor(ifelse(gender == "Female", 1, 0)),
Partner = as.factor(ifelse(Partner == "Yes", 1, 0)),
Dependents = as.factor(ifelse(Dependents == "Yes", 1, 0)),
PhoneService = as.factor(ifelse(PhoneService == "Yes", 1, 0)),
PaperlessBilling = as.factor(ifelse(PaperlessBilling == "Yes", 1, 0))
)
3.5 - Discretización
Primero vamos a crear la función que va a discretizar de forma automática maximizando la capacidad predictiva de la nueva variable.
Ademas, como vamos a usar en la modelización un algoritmo lineal, que es la regresión logística, vamos a intentar que la discretización sea monotónica
discretizar <- function(vi,target){
temp_df <- data.frame(vi = vi, target = target)
#smbinning necesita que la target sea numérica
temp_df$target <- as.numeric(as.character(temp_df$target))
disc <- smbinning(temp_df, y = 'target', x = 'vi')
return(disc)
}
Discretizamos la variable numérica más la antigúedad
#ANTIGÜEDAD:
disc_temp_tenure <- discretizar(df$tenure,df$TARGET_Churn)
df_temp <- select(df,tenure,TARGET_Churn) #creamos este temporal porque smbinning.gen necesita que el df tenga el mismo numero de columnas que la salida de la funcion discretizar
df_temp <- smbinning.gen(df_temp,disc_temp_tenure,chrname = 'tenure_DISC')
#Metemos en df la nueva variable discretizada y eliminamos la original
df <- cbind(df,df_temp$tenure_DISC) %>% select(-tenure)
df <- df %>% mutate(tenure_DISC = V2) %>% select(-V2)
glimpse(df)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ TARGET_Churn <fct> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One year...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, Bank...
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, 03 ...
#MONTHLYCHARGES:
disc_temp_MonthlyCharges <- discretizar(df$MonthlyCharges,df$TARGET_Churn)
df_temp <- select(df,MonthlyCharges,TARGET_Churn) #creamos este temporal porque smbinning.gen necesita que el df tenga el mismo numero de columnas que la salida de la funcion discretizar
df_temp <- smbinning.gen(df_temp,disc_temp_MonthlyCharges,chrname = 'MonthlyCharges_DISC')
#Metemos en df la nueva variable discretizada y eliminamos la original
df <- cbind(df,df_temp$MonthlyCharges_DISC) %>% select(-MonthlyCharges)
df <- df %>% mutate(MonthlyCharges_DISC = V2) %>% select(-V2)
glimpse(df)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795...
## $ TARGET_Churn <fct> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, B...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, ...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 03 <= 68.8, 02 <= 55.95, 02 <= 55....
Ya tenemos las variables discretizadas
Vamos a hacer una inspección visual de todas las variables a ver si han salido bien
df %>%
select_if(is.factor) %>%
gather() %>%
ggplot(aes(value)) +
geom_bar() +
facet_wrap(~ key, scales = "free") +
theme(axis.text=element_text(size=4))#esto es para cambiar el tamaño del texto del eje y que se lea bien
Ahora vamos a analizar la penetración de la target en cada categoría para ver si las variables han salido monotónicas
a <- function(var1,var2) {
df_temp <- data.frame(var1 = df[[var1]],var2 = df[[var2]])
df_temp %>%
group_by(var1) %>%
summarise(Conteo = n(), Porc = mean(as.numeric(as.character(var2)))) %>%
ggplot(aes(var1,Porc)) + geom_bar(stat='identity') + xlab(var1)
}
df2_nombres <- df %>% select_if(is.factor) %>% names()
lapply(df2_nombres,function(x){a(x,'TARGET_Churn')})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
Antes de continuar vamos a guardar en un objeto de R las discretizaciones, porque las necesitaremos después para poner el modelo en producción
#Vamos a crear un objeto de tipo lista que es lo ideal para guardar objetos complejos como las discretizaciones
discretizaciones <- list(
disc_temp_tenure = disc_temp_tenure,
disc_temp_MonthlyCharges = disc_temp_MonthlyCharges
)
saveRDS(discretizaciones,'02_CortesDiscretizaciones.rds')
Vamos a reordernar las variables en el conjunto de datos
#creamos un vector con las variables centrales
centrales <- setdiff(names(df),c('customerID','TARGET_Churn'))
df <- df %>% select(
customerID,
one_of(centrales),
TARGET_Churn)
Comprobamos de nuevo
glimpse(df)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, B...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, ...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 03 <= 68.8, 02 <= 55.95, 02 <= 55....
## $ TARGET_Churn <fct> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
3.6 - Limpieza
Limpiamos el entorno de cualquier cosa que no sea el dataframe
a_borrar <- setdiff(ls(),'df')
rm(list=c(a_borrar,'a_borrar'))
Guardamos otro cache temporal
saveRDS(df,'cacheV3.rds')
Cargamos el cache temporal
df <- readRDS('cacheV3.rds')
4.1 - Preparamos las funciones que vamos a necesitar:
Función para crear una matriz de confusión
confusion<-function(real,scoring,umbral){
conf<-table(real,scoring>=umbral)
if(ncol(conf)==2) return(conf) else return(NULL)
}
Funcion para calcular las métricas de los modelos: acierto, precisión, cobertura y F1
metricas<-function(matriz_conf){
acierto <- (matriz_conf[1,1] + matriz_conf[2,2]) / sum(matriz_conf) *100
precision <- matriz_conf[2,2] / (matriz_conf[2,2] + matriz_conf[1,2]) *100
cobertura <- matriz_conf[2,2] / (matriz_conf[2,2] + matriz_conf[2,1]) *100
F1 <- 2*precision*cobertura/(precision+cobertura)
salida<-c(acierto,precision,cobertura,F1)
return(salida)
}
Función para probar distintos umbrales y ver el efecto sobre precisión y cobertura
umbrales<-function(real,scoring){
umbrales<-data.frame(umbral=rep(0,times=19),acierto=rep(0,times=19),precision=rep(0,times=19),cobertura=rep(0,times=19),F1=rep(0,times=19))
cont <- 1
for (cada in seq(0.05,0.95,by = 0.05)){
datos<-metricas(confusion(real,scoring,cada))
registro<-c(cada,datos)
umbrales[cont,]<-registro
cont <- cont + 1
}
return(umbrales)
}
Funciones que calculan la curva ROC y el AUC
roc<-function(prediction){
r<-performance(prediction,'tpr','fpr')
plot(r)
}
auc<-function(prediction){
a<-performance(prediction,'auc')
return(a@y.values[[1]])
}
4.2 - Creamos las particiones de training (70%) y test (30%)
Establecemos una semilla para que los resultados sean reproducibles
set.seed(1966)
Generamos una variable aleatoria con una distribución 70-30
df$random<-sample(0:1,size = nrow(df),replace = T,prob = c(0.3,0.7))
Creamos los dataframes de entrenamiento y test
train<-filter(df,random==1)
test<-filter(df,random==0)
#Eliminamos ya la random para que no moleste
df$random <- NULL
4.3 - Creación del modelo de propensión
4.3.1 - Identificamos las variables
#Las independientes serán todas menos el código cliente y la target
independientes <- setdiff(names(df),c('customerID','TARGET_Churn'))
target <- 'TARGET_Churn'
4.3.2 - Creamos la formula para usar en el modelo
formula <- reformulate(independientes,target)
4.3.3 - Modelizamos con regresión logística
Primero vamos a hacer un modelo con todas las variables
formula_rl <- formula
rl<- glm(formula_rl,train,family=binomial(link='logit'))
summary(rl)
##
## Call:
## glm(formula = formula_rl, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2031 -0.6803 -0.2854 0.5935 3.2059
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.32770 0.27575 -1.188
## gender1 0.07486 0.07837 0.955
## SeniorCitizen1 0.27421 0.10113 2.711
## Partner1 0.12373 0.09480 1.305
## Dependents1 -0.21202 0.10866 -1.951
## PhoneService1 -0.49751 0.18639 -2.669
## MultipleLines1 0.35234 0.09623 3.661
## InternetService1 1.01152 0.37595 2.691
## OnlineSecurity1 -0.35266 0.10170 -3.468
## OnlineBackup1 -0.15682 0.09303 -1.686
## DeviceProtection1 -0.06723 0.09564 -0.703
## TechSupport1 -0.39839 0.10448 -3.813
## StreamingTV1 0.27837 0.09705 2.868
## StreamingMovies1 0.28556 0.09689 2.947
## ContractOne year -0.68488 0.12796 -5.352
## ContractTwo year -1.64104 0.22693 -7.231
## PaperlessBilling1 0.39840 0.09016 4.419
## PaymentMethodCredit card (automatic) -0.17206 0.13335 -1.290
## PaymentMethodElectronic check 0.24541 0.11217 2.188
## PaymentMethodMailed check -0.18247 0.13751 -1.327
## tenure_DISC02 <= 5 -0.96288 0.15420 -6.244
## tenure_DISC03 <= 16 -1.43990 0.14991 -9.605
## tenure_DISC04 <= 22 -2.03480 0.19256 -10.567
## tenure_DISC05 <= 49 -2.24234 0.16550 -13.549
## tenure_DISC06 <= 59 -2.53135 0.22267 -11.368
## tenure_DISC07 <= 70 -2.69302 0.23426 -11.496
## tenure_DISC08 > 70 -3.79466 0.43682 -8.687
## MonthlyCharges_DISC02 <= 55.95 0.03402 0.34081 0.100
## MonthlyCharges_DISC03 <= 68.8 -0.15090 0.40542 -0.372
## MonthlyCharges_DISC04 <= 106.75 0.79874 0.38173 2.092
## MonthlyCharges_DISC05 > 106.75 1.02781 0.46080 2.230
## Pr(>|z|)
## (Intercept) 0.234679
## gender1 0.339438
## SeniorCitizen1 0.006700 **
## Partner1 0.191832
## Dependents1 0.051041 .
## PhoneService1 0.007604 **
## MultipleLines1 0.000251 ***
## InternetService1 0.007133 **
## OnlineSecurity1 0.000525 ***
## OnlineBackup1 0.091858 .
## DeviceProtection1 0.482061
## TechSupport1 0.000137 ***
## StreamingTV1 0.004126 **
## StreamingMovies1 0.003205 **
## ContractOne year 0.000000086749010 ***
## ContractTwo year 0.000000000000478 ***
## PaperlessBilling1 0.000009926386231 ***
## PaymentMethodCredit card (automatic) 0.196948
## PaymentMethodElectronic check 0.028679 *
## PaymentMethodMailed check 0.184515
## tenure_DISC02 <= 5 0.000000000425799 ***
## tenure_DISC03 <= 16 < 0.0000000000000002 ***
## tenure_DISC04 <= 22 < 0.0000000000000002 ***
## tenure_DISC05 <= 49 < 0.0000000000000002 ***
## tenure_DISC06 <= 59 < 0.0000000000000002 ***
## tenure_DISC07 <= 70 < 0.0000000000000002 ***
## tenure_DISC08 > 70 < 0.0000000000000002 ***
## MonthlyCharges_DISC02 <= 55.95 0.920479
## MonthlyCharges_DISC03 <= 68.8 0.709734
## MonthlyCharges_DISC04 <= 106.75 0.036401 *
## MonthlyCharges_DISC05 > 106.75 0.025716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5735.7 on 4902 degrees of freedom
## Residual deviance: 4024.8 on 4872 degrees of freedom
## AIC: 4086.8
##
## Number of Fisher Scoring iterations: 6
Revisamos la significatividad y mantenemos aquellas variables que tengan tres estrellas en alguna categoría, esto es, un 99,9%
a_mantener <- c(
'MultipleLines',
'Contract',
'PaperlessBilling',
'tenure_DISC'
)
Volvemos a modelizar teniendo en cuenta la selección previa
formula_rl <- reformulate(a_mantener,target)
rl<- glm(formula_rl,train,family=binomial(link='logit'))
summary(rl)
##
## Call:
## glm(formula = formula_rl, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8857 -0.7630 -0.3361 0.8289 3.0266
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0001517 0.1106689 0.001 0.999
## MultipleLines1 0.7628045 0.0839170 9.090 < 0.0000000000000002 ***
## ContractOne year -1.0479130 0.1177975 -8.896 < 0.0000000000000002 ***
## ContractTwo year -2.3755733 0.2121735 -11.196 < 0.0000000000000002 ***
## PaperlessBilling1 0.8299346 0.0828428 10.018 < 0.0000000000000002 ***
## tenure_DISC02 <= 5 -0.7012579 0.1376750 -5.094 0.000000351375139 ***
## tenure_DISC03 <= 16 -1.1444295 0.1306145 -8.762 < 0.0000000000000002 ***
## tenure_DISC04 <= 22 -1.7429628 0.1715311 -10.161 < 0.0000000000000002 ***
## tenure_DISC05 <= 49 -1.7975089 0.1372428 -13.097 < 0.0000000000000002 ***
## tenure_DISC06 <= 59 -1.9152855 0.1931056 -9.918 < 0.0000000000000002 ***
## tenure_DISC07 <= 70 -2.0531326 0.2000634 -10.262 < 0.0000000000000002 ***
## tenure_DISC08 > 70 -3.0242939 0.4104166 -7.369 0.000000000000172 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5735.7 on 4902 degrees of freedom
## Residual deviance: 4408.8 on 4891 degrees of freedom
## AIC: 4432.8
##
## Number of Fisher Scoring iterations: 6
Vemos que ahora ya todas las variables tienen al menos una categoría con 3 estrellas de significación
Vamos a mirar el signo de los coeficientes, que deberá seguir la lógica de negocio: todas las variables tienen lógica, asi que vamos a comprobar este modelo sobre el conjunto de test
Y calculamos el pseudo R cuadrado:
pr2_rl <- 1 -(rl$deviance / rl$null.deviance)
pr2_rl
## [1] 0.2313398
Aplicamos el modelo al conjunto de test, generando un vector con las probabilidades
rl_predict<-predict(rl,test,type = 'response')
Visualizamos
plot(rl_predict~test$TARGET_Churn)
Ahora tenemos que transformar la probabilidad en una decisión de si el cliente va a abandonar o no.
A priori vemos que el modelo discrimina bien entre las dos clases de la target
Con la función umbrales probamos diferentes cortes
umb_rl<-umbrales(test$TARGET_Churn,rl_predict)
umb_rl
## umbral acierto precision cobertura F1
## 1 0.05 46.40676 31.74123 97.765363 47.923323
## 2 0.10 55.61296 35.71429 94.972067 51.908397
## 3 0.15 62.84641 39.53871 89.385475 54.825814
## 4 0.20 64.30249 40.47822 88.268156 55.503513
## 5 0.25 70.36167 45.25253 83.426443 58.677145
## 6 0.30 75.24659 50.62500 75.418994 60.583396
## 7 0.35 76.13903 51.98358 70.763501 59.936909
## 8 0.40 77.12541 53.56125 70.018622 60.694108
## 9 0.45 78.34664 59.04762 46.182495 51.828631
## 10 0.50 78.11179 59.12596 42.830540 49.676026
## 11 0.55 77.87694 65.42056 26.070764 37.283622
## 12 0.60 77.87694 65.42056 26.070764 37.283622
## 13 0.65 77.40723 70.58824 17.877095 28.528975
## 14 0.70 76.09206 76.92308 7.448790 13.582343
## 15 0.75 75.38751 93.33333 2.607076 5.072464
## 16 0.80 75.38751 93.33333 2.607076 5.072464
## 17 0.85 0.85000 0.85000 0.850000 0.850000
## 18 0.90 0.90000 0.90000 0.900000 0.900000
## 19 0.95 0.95000 0.95000 0.950000 0.950000
Seleccionamos el umbral que maximiza la F1
umbral_final_rl<-umb_rl[which.max(umb_rl$F1),1]
umbral_final_rl
## [1] 0.4
Evaluamos la matriz de confusión y las métricas con el umbral optimizado
confusion(test$TARGET_Churn,rl_predict,umbral_final_rl)
##
## real FALSE TRUE
## 0 1266 326
## 1 161 376
rl_metricas<-filter(umb_rl,umbral==umbral_final_rl)
rl_metricas
## umbral acierto precision cobertura F1
## 1 0.4 77.12541 53.56125 70.01862 60.69411
Evaluamos la ROC
#creamos el objeto prediction
rl_prediction<-prediction(rl_predict,test$TARGET_Churn)
#visualizamos la ROC
roc(rl_prediction)
Sacamos las métricas definitivas incluyendo el AUC
rl_metricas<-cbind(rl_metricas,AUC=round(auc(rl_prediction),2)*100)
print(t(rl_metricas))
## [,1]
## umbral 0.40000
## acierto 77.12541
## precision 53.56125
## cobertura 70.01862
## F1 60.69411
## AUC 82.00000
4.3.4 - Modelizamos con Arboles de decisión
Creamos el primer modelo
formula_ar <- formula
ar<-rpart(formula_ar, train, method = 'class', parms = list(
split = "information"),
control = rpart.control(cp = 0.00001))
Revisamos donde el error de validación cruzada empieza a crecer
printcp(ar)
##
## Classification tree:
## rpart(formula = formula_ar, data = train, method = "class", parms = list(split = "information"),
## control = rpart.control(cp = 0.00001))
##
## Variables actually used in tree construction:
## [1] Contract Dependents DeviceProtection
## [4] gender InternetService MonthlyCharges_DISC
## [7] MultipleLines OnlineBackup OnlineSecurity
## [10] PaperlessBilling Partner PaymentMethod
## [13] PhoneService SeniorCitizen StreamingMovies
## [16] StreamingTV TechSupport tenure_DISC
##
## Root node error: 1332/4903 = 0.27167
##
## n= 4903
##
## CP nsplit rel error xerror xstd
## 1 0.07182182 0 1.00000 1.00000 0.023384
## 2 0.00700701 3 0.78453 0.78453 0.021528
## 3 0.00525526 6 0.76351 0.79505 0.021632
## 4 0.00500501 9 0.74775 0.79505 0.021632
## 5 0.00337838 12 0.73273 0.78228 0.021505
## 6 0.00300300 21 0.69970 0.78754 0.021558
## 7 0.00225225 22 0.69670 0.76952 0.021376
## 8 0.00187688 26 0.68769 0.76351 0.021315
## 9 0.00180180 30 0.68018 0.76652 0.021345
## 10 0.00175175 35 0.67117 0.76652 0.021345
## 11 0.00150150 42 0.65766 0.77703 0.021453
## 12 0.00112613 57 0.63514 0.77553 0.021437
## 13 0.00100100 61 0.63063 0.76952 0.021376
## 14 0.00075075 64 0.62763 0.77553 0.021437
## 15 0.00050050 77 0.61787 0.78979 0.021580
## 16 0.00045045 83 0.61486 0.79505 0.021632
## 17 0.00037538 88 0.61261 0.81156 0.021793
## 18 0.00025025 96 0.60961 0.81456 0.021822
## 19 0.00018769 105 0.60736 0.82357 0.021908
## 20 0.00001000 109 0.60661 0.83183 0.021986
plotcp(ar)
Parece que minimiza aprox en 0.002 de complejidad Generamos un nuevo árbol con ese parámetro Ademas vamos a incluir un nuevo parametro para que el árbol no tenga mas de 10 niveles
ar<-rpart(formula, train, method = 'class', parms = list(
split = "information"),
control = rpart.control(cp = 0.002,maxdepth = 10))
Revisamos de nuevo la complejidad
printcp(ar)
##
## Classification tree:
## rpart(formula = formula, data = train, method = "class", parms = list(split = "information"),
## control = rpart.control(cp = 0.002, maxdepth = 10))
##
## Variables actually used in tree construction:
## [1] Contract DeviceProtection gender
## [4] InternetService MonthlyCharges_DISC MultipleLines
## [7] OnlineBackup OnlineSecurity PaperlessBilling
## [10] PaymentMethod PhoneService StreamingTV
## [13] TechSupport tenure_DISC
##
## Root node error: 1332/4903 = 0.27167
##
## n= 4903
##
## CP nsplit rel error xerror xstd
## 1 0.0718218 0 1.00000 1.00000 0.023384
## 2 0.0070070 3 0.78453 0.78453 0.021528
## 3 0.0052553 6 0.76351 0.79054 0.021588
## 4 0.0050050 9 0.74775 0.78378 0.021521
## 5 0.0033784 12 0.73273 0.78904 0.021573
## 6 0.0030030 21 0.69970 0.76802 0.021361
## 7 0.0022523 22 0.69670 0.76502 0.021330
## 8 0.0020000 26 0.68769 0.76426 0.021322
plotcp(ar)
Ahora parece bastante estable
Vamos a crear el gráfico del árbol para analizarlo
rpart.plot(ar,type=2,extra = 7, under = TRUE,under.cex = 0.7,fallen.leaves=F,gap = 0,cex=0.2,yesno = 2,box.palette = "GnYlRd",branch.lty = 3)
Vamos a sacar las reglas que podrían ser utilizadas por ejemplo para hacer una implantación del árbol
rpart.rules(ar,style = 'tall',cover = T)
## TARGET_Churn is 0.07 with cover 44% when
## Contract is One year or Two year
##
## TARGET_Churn is 0.08 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## MultipleLines is 1
## OnlineSecurity is 1
## gender is 0
##
## TARGET_Churn is 0.20 with cover 14% when
## Contract is Month-to-month
## tenure_DISC is 03 <= 16 or 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
##
## TARGET_Churn is 0.22 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1
## MonthlyCharges_DISC is 04 <= 106.75
## TechSupport is 1
##
## TARGET_Churn is 0.25 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## PaymentMethod is Credit card (automatic) or Mailed check
## TechSupport is 0
## InternetService is 1
## PhoneService is 0
## PaperlessBilling is 0
##
## TARGET_Churn is 0.26 with cover 4% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## InternetService is 0
##
## TARGET_Churn is 0.28 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Bank transfer (automatic) or Credit card (automatic) or Mailed check
## MultipleLines is 0
## PaperlessBilling is 0
## gender is 1
##
## TARGET_Churn is 0.29 with cover 7% when
## Contract is Month-to-month
## tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Bank transfer (automatic) or Credit card (automatic) or Mailed check
##
## TARGET_Churn is 0.30 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## TechSupport is 0
## InternetService is 1
## PhoneService is 1
## OnlineSecurity is 1
##
## TARGET_Churn is 0.33 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## MultipleLines is 0
## PaperlessBilling is 0
## gender is 0
##
## TARGET_Churn is 0.33 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## TechSupport is 1
## InternetService is 1
##
## TARGET_Churn is 0.33 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## TechSupport is 0
## InternetService is 1
## PhoneService is 1
## OnlineSecurity is 0
## OnlineBackup is 1
##
## TARGET_Churn is 0.37 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Electronic check
## DeviceProtection is 1
## StreamingTV is 0
##
## TARGET_Churn is 0.40 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Electronic check
## TechSupport is 1
## DeviceProtection is 1
## StreamingTV is 1
##
## TARGET_Churn is 0.42 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## PaymentMethod is Bank transfer (automatic) or Electronic check
## TechSupport is 0
## InternetService is 1
## PhoneService is 1
## OnlineSecurity is 0
## OnlineBackup is 0
##
## TARGET_Churn is 0.43 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Mailed check
## MultipleLines is 0
## PaperlessBilling is 1
##
## TARGET_Churn is 0.44 with cover 5% when
## Contract is Month-to-month
## tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Electronic check
## DeviceProtection is 0
##
## TARGET_Churn is 0.54 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## PaymentMethod is Bank transfer (automatic) or Electronic check
## TechSupport is 0
## InternetService is 1
## PhoneService is 1
## OnlineSecurity is 0
## OnlineBackup is 0
##
## TARGET_Churn is 0.61 with cover 4% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Bank transfer (automatic) or Credit card (automatic) or Electronic check
## MultipleLines is 0
## PaperlessBilling is 1
##
## TARGET_Churn is 0.65 with cover 2% when
## Contract is Month-to-month
## tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Electronic check
## TechSupport is 0
## DeviceProtection is 1
## StreamingTV is 1
##
## TARGET_Churn is 0.66 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## PaymentMethod is Credit card (automatic) or Mailed check
## TechSupport is 0
## InternetService is 1
## PhoneService is 1
## OnlineSecurity is 0
## OnlineBackup is 0
##
## TARGET_Churn is 0.67 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## PaymentMethod is Credit card (automatic) or Mailed check
## TechSupport is 0
## InternetService is 1
## PhoneService is 0
## PaperlessBilling is 1
##
## TARGET_Churn is 0.68 with cover 0% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## MultipleLines is 1
## OnlineSecurity is 1
## gender is 1
##
## TARGET_Churn is 0.70 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## PaymentMethod is Electronic check
## MultipleLines is 0
## PaperlessBilling is 0
## gender is 1
##
## TARGET_Churn is 0.77 with cover 5% when
## Contract is Month-to-month
## tenure_DISC is 02 <= 5 or 03 <= 16
## MonthlyCharges_DISC is 04 <= 106.75
## MultipleLines is 1
## OnlineSecurity is 0
##
## TARGET_Churn is 0.85 with cover 1% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1 or 02 <= 5
## MonthlyCharges_DISC is 01 <= 26.9 or 02 <= 55.95 or 03 <= 68.8 or 05 > 106.75
## PaymentMethod is Bank transfer (automatic) or Electronic check
## TechSupport is 0
## InternetService is 1
## PhoneService is 0
##
## TARGET_Churn is 0.90 with cover 3% when
## Contract is Month-to-month
## tenure_DISC is 01 <= 1
## MonthlyCharges_DISC is 04 <= 106.75
## TechSupport is 0
#style sirve para que la salida sea mas legible y cover añade el % de casos e los que aplica la regla
Podemos llevarnos el nodo final de cada cliente a un data.frame para poder hacer una explotacion posterior
#Para ello usaremos el predict específico de rpart y con el parámetro nn
ar_numnodos<-rpart.predict(ar,test,nn = T)
head(ar_numnodos)
## 0 1 nn
## 1 0.1489362 0.8510638 223
## 2 0.9281106 0.0718894 2
## 3 0.6000000 0.4000000 238
## 4 0.9281106 0.0718894 2
## 5 0.6279070 0.3720930 118
## 6 0.8025937 0.1974063 12
Vamos a calcular los scorings y evaluar el modelo
ar_predict<-predict(ar,test,type = 'prob')[,2]
Visualizamos
plot(ar_predict~test$TARGET_Churn)
Verificamos que el modelo discrimina bien las clases de la target
Con la función umbrales probamos diferentes cortes
umb_ar<-umbrales(test$TARGET_Churn,ar_predict)
umb_ar
## umbral acierto precision cobertura F1
## 1 0.05 0.05000 0.05000 0.05000 0.05000
## 2 0.10 66.08736 41.86456 88.64060 56.86977
## 3 0.15 66.08736 41.86456 88.64060 56.86977
## 4 0.20 73.97839 49.00817 78.21229 60.25825
## 5 0.25 73.93142 48.94860 78.02607 60.15793
## 6 0.30 78.53452 56.45161 65.17691 60.50130
## 7 0.35 79.05120 58.11052 60.70764 59.38069
## 8 0.40 79.33302 59.03166 59.03166 59.03166
## 9 0.45 78.86332 61.06870 44.69274 51.61290
## 10 0.50 78.86332 61.06870 44.69274 51.61290
## 11 0.55 78.53452 60.47120 43.01676 50.27203
## 12 0.60 78.53452 60.47120 43.01676 50.27203
## 13 0.65 77.97088 63.70968 29.42272 40.25478
## 14 0.70 78.39361 68.78049 26.25698 38.00539
## 15 0.75 78.44058 70.31250 25.13966 37.03704
## 16 0.80 76.93753 78.75000 11.73184 20.42139
## 17 0.85 76.93753 78.75000 11.73184 20.42139
## 18 0.90 0.90000 0.90000 0.90000 0.90000
## 19 0.95 0.95000 0.95000 0.95000 0.95000
Seleccionamos automáticamente el mejor umbral
umbral_final_ar<-umb_ar[which.max(umb_ar$F1),1]
umbral_final_ar
## [1] 0.3
Evaluamos la matriz de confusión y las métricas con el umbral optimizado
confusion(test$TARGET_Churn,ar_predict,umbral_final_ar)
##
## real FALSE TRUE
## 0 1322 270
## 1 187 350
ar_metricas<-filter(umb_ar,umbral==umbral_final_ar)
ar_metricas
## umbral acierto precision cobertura F1
## 1 0.3 78.53452 56.45161 65.17691 60.5013
Evaluamos la ROC
#creamos el objeto prediction
ar_prediction<-prediction(ar_predict,test$TARGET_Churn)
#visualizamos la ROC
roc(ar_prediction)
Sacamos las métricas definitivas incluyendo el AUC
ar_metricas<-cbind(ar_metricas,AUC=round(auc(ar_prediction),2)*100)
print(t(ar_metricas))
## [,1]
## umbral 0.30000
## acierto 78.53452
## precision 56.45161
## cobertura 65.17691
## F1 60.50130
## AUC 81.00000
La métrica AUC de este modelo (con sus hiperparámetros), es inferior al modelo rl
4.3.5 - Modelizamos con Random Forest
Creamos el modelo
formula_rf <- formula
rf<-randomForest(formula_rf,train,importance=T)
rf
##
## Call:
## randomForest(formula = formula_rf, data = train, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 22.11%
## Confusion matrix:
## 0 1 class.error
## 0 3119 452 0.1265752
## 1 632 700 0.4744745
Visualizamos las variables mas importantes
varImpPlot(rf)
Como hay dos criterios vamos a crear una única variable agregada y visualizarla para tener una mejor idea de la importancia de cada variable
importancia <- importance(rf)[,3:4]
#normalizamos para poner las dos variables en la misma escala. los valores negativos son las que menos predicen y los positivos las que mas
importancia_norm <- as.data.frame(scale(importancia))
#creamos una única variable como suma de las otras
importancia_norm <- importancia_norm %>% mutate(
Variable = rownames(importancia_norm),
Imp_tot = MeanDecreaseAccuracy + MeanDecreaseGini) %>%
mutate(Imp_tot = Imp_tot + abs(min(Imp_tot))) %>%
arrange(desc(Imp_tot)) %>%
select(Variable,Imp_tot,MeanDecreaseAccuracy,MeanDecreaseGini)
#hacemos un gráfico para ver la curva de caída de importancia
ggplot(importancia_norm, aes(reorder(Variable,-Imp_tot),Imp_tot)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90,size = 7))
importancia_norm
## Variable Imp_tot MeanDecreaseAccuracy MeanDecreaseGini
## 1 tenure_DISC 6.6379068 2.0261585 3.0655569
## 2 Contract 5.2401732 2.1427012 1.5512806
## 3 MonthlyCharges_DISC 3.5992376 1.3220516 0.7309946
## 4 PaymentMethod 3.0103400 0.2887059 1.1754427
## 5 OnlineSecurity 1.7155692 0.5183707 -0.3489929
## 6 TechSupport 1.6545280 0.4521642 -0.3438275
## 7 InternetService 1.5337450 0.5916553 -0.6041017
## 8 OnlineBackup 0.9720426 -0.1731207 -0.4010281
## 9 DeviceProtection 0.6905442 -0.3982576 -0.4573896
## 10 Partner 0.5660724 -0.5546091 -0.4255099
## 11 StreamingMovies 0.4770664 -0.5959509 -0.4731742
## 12 PaperlessBilling 0.4603806 -0.7212204 -0.3645905
## 13 Dependents 0.4334663 -0.6031939 -0.5095312
## 14 PhoneService 0.2200307 -0.4800449 -0.8461158
## 15 gender 0.2190837 -1.0293619 -0.2977458
## 16 SeniorCitizen 0.2138680 -0.8293266 -0.5029968
## 17 StreamingTV 0.1873908 -0.8744265 -0.4843742
## 18 MultipleLines 0.0000000 -1.0822947 -0.4638967
La caída es bastante gradual, así que no hay corte claro. Podemos coger por ejemplo hasta PaymentMethod incluido, que tiene una importancia total de 3,01.
a_mantener <- importancia_norm %>%
filter(Imp_tot > 3.0) %>%
select(Variable)
#Extraemos los nombres como un vector
a_mantener <- as.character((a_mantener$Variable))
Creamos de nuevo el modelo con las nuevas variables
formula_rf <- reformulate(a_mantener,target)
rf<-randomForest(formula_rf,train,importance=T)
rf
##
## Call:
## randomForest(formula = formula_rf, data = train, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 21.19%
## Confusion matrix:
## 0 1 class.error
## 0 3187 384 0.1075329
## 1 655 677 0.4917417
Aplicamos el modelo al conjunto de test, generando un vector con las probabilidades
Hay que poner el type=prob para tener el scoring, lo cual nos dara una matriz que nos tenemos que quedar con la segunda columna
rf_predict<-predict(rf,test,type = 'prob')[,2]
Visualizamos
plot(rf_predict~test$TARGET_Churn)
Con la función umbrales probamos diferentes cortes
umb_rf<-umbrales(test$TARGET_Churn,rf_predict)
umb_rf
## umbral acierto precision cobertura F1
## 1 0.05 71.11320 45.96273 82.68156 59.08184
## 2 0.10 74.72992 49.93880 75.97765 60.26588
## 3 0.15 76.98450 53.38129 69.08752 60.22727
## 4 0.20 77.64209 54.55904 67.97020 60.53068
## 5 0.25 78.25270 55.79937 66.29423 60.59574
## 6 0.30 78.95726 57.23577 65.54935 61.11111
## 7 0.35 79.28605 58.13559 63.87337 60.86957
## 8 0.40 79.66181 61.40351 52.14153 56.39476
## 9 0.45 79.70878 61.58940 51.95531 56.36364
## 10 0.50 79.47393 61.52074 49.72067 54.99485
## 11 0.55 79.47393 61.73709 48.97579 54.62098
## 12 0.60 79.00423 61.71875 44.13408 51.46580
## 13 0.65 79.23908 63.61032 41.34078 50.11287
## 14 0.70 79.37999 65.03067 39.47858 49.13094
## 15 0.75 79.23908 65.78073 36.87151 47.25537
## 16 0.80 79.05120 65.31987 36.12663 46.52278
## 17 0.85 78.76938 65.23297 33.89199 44.60784
## 18 0.90 78.72240 66.27907 31.84358 43.01887
## 19 0.95 77.82997 64.44444 27.00186 38.05774
Seleccionamos automáticamente el mejor umbral
umbral_final_rf<-umb_rf[which.max(umb_rf$F1),1]
umbral_final_rf
## [1] 0.3
Evaluamos la matriz de confusión y las métricas con el umbral optimizado
confusion(test$TARGET_Churn,rf_predict,umbral_final_rf)
##
## real FALSE TRUE
## 0 1329 263
## 1 185 352
rf_metricas<-filter(umb_rf,umbral==umbral_final_rf)
rf_metricas
## umbral acierto precision cobertura F1
## 1 0.3 78.95726 57.23577 65.54935 61.11111
Evaluamos la ROC
#creamos el objeto prediction
rf_prediction<-prediction(rf_predict,test$TARGET_Churn)
#visualizamos la ROC
roc(rf_prediction)
Sacamos las métricas definitivas incluyendo el AUC
rf_metricas<-cbind(rf_metricas,AUC=round(auc(rf_prediction),2)*100)
print(t(rf_metricas))
## [,1]
## umbral 0.30000
## acierto 78.95726
## precision 57.23577
## cobertura 65.54935
## F1 61.11111
## AUC 82.00000
4.3.6 - Modelizamos con Support Vector Machines
Creamos el modelo
set.seed(1966)
formula_svm <- formula
tic()
svm <- svm(formula_svm,train,kernel="linear", type = 'C-classification', probability = TRUE, cost=0.1, scale=FALSE)
toc()
## 6.66 sec elapsed
svm
##
## Call:
## svm(formula = formula_svm, data = train, kernel = "linear", type = "C-classification",
## probability = TRUE, cost = 0.1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 2388
Generamos un resumen del modelo generado
summary(svm)
##
## Call:
## svm(formula = formula_svm, data = train, kernel = "linear", type = "C-classification",
## probability = TRUE, cost = 0.1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 2388
##
## ( 1199 1189 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
Generamos el vector de probabilidades aplicando predict al conjunto de test
svm_predict <- predict(svm, test, probability=TRUE)
svm_predict <- attr(svm_predict, "probabilities")
svm_predict <- (svm_predict)[,2]
head(svm_predict)
## 1 2 3 4 5 6
## 0.63691574 0.08352067 0.25548505 0.12419165 0.41578567 0.06015107
Graficamos el modelo generado
plot(svm_predict~test$TARGET_Churn)
Con la función umbrales probamos diferentes cortes
umb_svm<-umbrales(test$TARGET_Churn,svm_predict)
umb_svm
## umbral acierto precision cobertura F1
## 1 0.05 44.52795 31.16959 99.2551210 47.441032
## 2 0.10 55.56599 35.68929 94.9720670 51.881994
## 3 0.15 65.33584 41.43223 90.5027933 56.842105
## 4 0.20 71.11320 46.05263 84.7299814 59.672131
## 5 0.25 75.57539 51.04551 77.2811918 61.481481
## 6 0.30 78.20573 55.37555 70.0186220 61.842105
## 7 0.35 79.47393 58.47458 64.2458101 61.224490
## 8 0.40 79.99061 60.99010 57.3556797 59.117083
## 9 0.45 79.99061 62.47191 51.7690875 56.619145
## 10 0.50 79.33302 62.21662 45.9962756 52.890792
## 11 0.55 79.19211 63.50575 41.1545624 49.943503
## 12 0.60 79.37999 65.70513 38.1750466 48.292108
## 13 0.65 79.00423 67.04545 32.9608939 44.194757
## 14 0.70 79.28605 72.85714 28.4916201 40.963855
## 15 0.75 78.15876 76.86567 19.1806331 30.700447
## 16 0.80 76.23297 81.63265 7.4487896 13.651877
## 17 0.85 75.01174 100.00000 0.9310987 1.845018
## 18 0.90 0.90000 0.90000 0.9000000 0.900000
## 19 0.95 0.95000 0.95000 0.9500000 0.950000
Seleccionamos automáticamente el mejor umbral
umbral_final_svm<-umb_svm[which.max(umb_svm$F1),1]
umbral_final_svm
## [1] 0.3
Evaluamos la matriz de confusión y las métricas con el umbral optimizado
confusion(test$TARGET_Churn,svm_predict,umbral_final_svm)
##
## real FALSE TRUE
## 0 1289 303
## 1 161 376
svm_metricas<-filter(umb_svm,umbral==umbral_final_svm)
svm_metricas
## umbral acierto precision cobertura F1
## 1 0.3 78.20573 55.37555 70.01862 61.84211
Evaluamos la ROC
#creamos el objeto prediction
svm_prediction<-prediction(svm_predict,test$TARGET_Churn)
#visualizamos la ROC
roc(svm_prediction)
Sacamos las métricas definitivas incluyendo el AUC
svm_metricas<-cbind(svm_metricas,AUC=round(auc(svm_prediction),2)*100)
print(t(svm_metricas))
## [,1]
## umbral 0.30000
## acierto 78.20573
## precision 55.37555
## cobertura 70.01862
## F1 61.84211
## AUC 83.00000
4.3.7 - Gradient Boosting con caret y ajuste de hiperparámetros
df_gbm <- df
glimpse(df_gbm)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, B...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, ...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 03 <= 68.8, 02 <= 55.95, 02 <= 55....
## $ TARGET_Churn <fct> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
df_gbm <- df_gbm %>%
mutate(TARGET_Churn = ifelse(TARGET_Churn == 0, "No", "Yes"))
df_gbm$TARGET_Churn <- as.factor(df_gbm$TARGET_Churn)
glimpse(df_gbm)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, B...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, ...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 03 <= 68.8, 02 <= 55.95, 02 <= 55....
## $ TARGET_Churn <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No,...
train_gbm <- train %>% select(-random)
glimpse(train_gbm)
## Rows: 4,903
## Columns: 20
## $ customerID <chr> "5575-GNVDE", "3668-QPYBK", "7795-CFOCW", "9237...
## $ gender <fct> 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ Partner <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,...
## $ PhoneService <fct> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ MultipleLines <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,...
## $ OnlineBackup <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,...
## $ DeviceProtection <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,...
## $ StreamingTV <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,...
## $ Contract <fct> One year, Month-to-month, One year, Month-to-mo...
## $ PaperlessBilling <fct> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,...
## $ PaymentMethod <fct> Mailed check, Mailed check, Bank transfer (auto...
## $ tenure_DISC <fct> 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, 03 <= 16,...
## $ MonthlyCharges_DISC <fct> 03 <= 68.8, 02 <= 55.95, 02 <= 55.95, 04 <= 106...
## $ TARGET_Churn <fct> 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,...
train_gbm <- train_gbm %>%
mutate(TARGET_Churn = ifelse(TARGET_Churn == 0, "No", "Yes"))
train_gbm$TARGET_Churn <- as.factor(train_gbm$TARGET_Churn)
glimpse(train_gbm)
## Rows: 4,903
## Columns: 20
## $ customerID <chr> "5575-GNVDE", "3668-QPYBK", "7795-CFOCW", "9237...
## $ gender <fct> 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ Partner <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,...
## $ PhoneService <fct> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ MultipleLines <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,...
## $ OnlineBackup <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,...
## $ DeviceProtection <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,...
## $ StreamingTV <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,...
## $ Contract <fct> One year, Month-to-month, One year, Month-to-mo...
## $ PaperlessBilling <fct> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,...
## $ PaymentMethod <fct> Mailed check, Mailed check, Bank transfer (auto...
## $ tenure_DISC <fct> 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, 03 <= 16,...
## $ MonthlyCharges_DISC <fct> 03 <= 68.8, 02 <= 55.95, 02 <= 55.95, 04 <= 106...
## $ TARGET_Churn <fct> No, Yes, No, Yes, Yes, No, No, Yes, No, No, No,...
test_gbm <- test %>% select(-random)
glimpse(test_gbm)
## Rows: 2,129
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "7469-LKBCI", "5129-JLPIS", "9959...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,...
## $ Dependents <fct> 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,...
## $ InternetService <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,...
## $ DeviceProtection <fct> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ StreamingMovies <fct> 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
## $ Contract <fct> Month-to-month, Two year, Month-to-month, Two y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,...
## $ PaymentMethod <fct> Electronic check, Credit card (automatic), Elec...
## $ tenure_DISC <fct> 01 <= 1, 03 <= 16, 05 <= 49, 08 > 70, 04 <= 22,...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 01 <= 26.9, 04 <= 106.75, 04 <= 10...
## $ TARGET_Churn <fct> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,...
test_gbm <- test_gbm %>%
mutate(TARGET_Churn = ifelse(TARGET_Churn == 0, "No", "Yes"))
test_gbm$TARGET_Churn <- as.factor(test_gbm$TARGET_Churn)
glimpse(test_gbm)
## Rows: 2,129
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "7469-LKBCI", "5129-JLPIS", "9959...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,...
## $ Dependents <fct> 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,...
## $ InternetService <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,...
## $ DeviceProtection <fct> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ StreamingMovies <fct> 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
## $ Contract <fct> Month-to-month, Two year, Month-to-month, Two y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,...
## $ PaymentMethod <fct> Electronic check, Credit card (automatic), Elec...
## $ tenure_DISC <fct> 01 <= 1, 03 <= 16, 05 <= 49, 08 > 70, 04 <= 22,...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 01 <= 26.9, 04 <= 106.75, 04 <= 10...
## $ TARGET_Churn <fct> No, No, No, No, No, No, Yes, Yes, No, No, No, Y...
Creamos un objeto con los hiperparámetros a ajustar
hiperparametros <- expand.grid(interaction.depth = c(1, 2),
n.trees = c(500, 1000, 2000),
shrinkage = c(0.001, 0.01, 0.1),
n.minobsinnode = c(2, 5, 15))
Creamos un objeto de control
control_train <- trainControl(method = "cv",
number = 10,
returnResamp = "final", verboseIter = FALSE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
allowParallel = TRUE)
Personalizamos la fórmula para aplcar el modelo Gradient Boosting
formula_gbm <- formula
tic()
gbm <- train(formula_gbm,train_gbm,
method = 'gbm',
tuneGrid = hiperparametros,
metric = 'ROC',
trControl = control_train,
distribution = 'adaboost',
verbose = FALSE)
toc()
## 1239.83 sec elapsed
gbm
## Stochastic Gradient Boosting
##
## 4903 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4413, 4412, 4412, 4413, 4413, 4413, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees ROC Sens
## 0.001 1 2 500 0.7663104 1.0000000
## 0.001 1 2 1000 0.7894056 1.0000000
## 0.001 1 2 2000 0.8066166 1.0000000
## 0.001 1 5 500 0.7658955 1.0000000
## 0.001 1 5 1000 0.7899465 1.0000000
## 0.001 1 5 2000 0.8065344 1.0000000
## 0.001 1 15 500 0.7692220 1.0000000
## 0.001 1 15 1000 0.7900079 1.0000000
## 0.001 1 15 2000 0.8061254 1.0000000
## 0.001 2 2 500 0.8079353 1.0000000
## 0.001 2 2 1000 0.8132925 1.0000000
## 0.001 2 2 2000 0.8194459 0.9577164
## 0.001 2 5 500 0.8056870 1.0000000
## 0.001 2 5 1000 0.8127927 1.0000000
## 0.001 2 5 2000 0.8195416 0.9579965
## 0.001 2 15 500 0.8061858 1.0000000
## 0.001 2 15 1000 0.8129468 1.0000000
## 0.001 2 15 2000 0.8195627 0.9563150
## 0.010 1 2 500 0.8196642 0.9397963
## 0.010 1 2 1000 0.8280525 0.9140330
## 0.010 1 2 2000 0.8358770 0.8933102
## 0.010 1 5 500 0.8193233 0.9392360
## 0.010 1 5 1000 0.8278892 0.9112319
## 0.010 1 5 2000 0.8363355 0.8930285
## 0.010 1 15 500 0.8194502 0.9397955
## 0.010 1 15 1000 0.8279383 0.9134728
## 0.010 1 15 2000 0.8361987 0.8924698
## 0.010 2 2 500 0.8283268 0.9193536
## 0.010 2 2 1000 0.8361646 0.9022722
## 0.010 2 2 2000 0.8411421 0.8997535
## 0.010 2 5 500 0.8283133 0.9201931
## 0.010 2 5 1000 0.8358940 0.9014311
## 0.010 2 5 2000 0.8406042 0.8986331
## 0.010 2 15 500 0.8284402 0.9199130
## 0.010 2 15 1000 0.8356514 0.9031110
## 0.010 2 15 2000 0.8408300 0.9022738
## 0.100 1 2 500 0.8439061 0.8966676
## 0.100 1 2 1000 0.8457000 0.8994703
## 0.100 1 2 2000 0.8460374 0.8997535
## 0.100 1 5 500 0.8435598 0.8930316
## 0.100 1 5 1000 0.8468278 0.8972317
## 0.100 1 5 2000 0.8460207 0.8989124
## 0.100 1 15 500 0.8438376 0.8944298
## 0.100 1 15 1000 0.8460445 0.8972310
## 0.100 1 15 2000 0.8459388 0.9000329
## 0.100 2 2 500 0.8382156 0.8944298
## 0.100 2 2 1000 0.8364657 0.8913478
## 0.100 2 2 2000 0.8314494 0.8851846
## 0.100 2 5 500 0.8402325 0.8961097
## 0.100 2 5 1000 0.8371165 0.8949885
## 0.100 2 5 2000 0.8306449 0.8879873
## 0.100 2 15 500 0.8410111 0.8977920
## 0.100 2 15 1000 0.8367298 0.8941513
## 0.100 2 15 2000 0.8308783 0.8795862
## Spec
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.0000000
## 0.2529514
## 0.0000000
## 0.0000000
## 0.2461901
## 0.0000000
## 0.0000000
## 0.2499439
## 0.3190214
## 0.4361576
## 0.5022164
## 0.3227640
## 0.4399057
## 0.5059645
## 0.3167546
## 0.4354057
## 0.5067108
## 0.4218943
## 0.4872012
## 0.5209741
## 0.4211424
## 0.4909494
## 0.5209741
## 0.4166480
## 0.4864549
## 0.5164796
## 0.5322579
## 0.5307710
## 0.5330042
## 0.5405061
## 0.5390024
## 0.5375210
## 0.5367636
## 0.5397711
## 0.5322523
## 0.5262316
## 0.5307373
## 0.5239928
## 0.5187297
## 0.5277129
## 0.5254405
## 0.5187297
## 0.5247503
## 0.5262316
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 1000, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 5.
Generamos un resumen del modelo generado
summary(gbm)
## var
## ContractTwo year ContractTwo year
## PaymentMethodElectronic check PaymentMethodElectronic check
## ContractOne year ContractOne year
## MonthlyCharges_DISC04 <= 106.75 MonthlyCharges_DISC04 <= 106.75
## tenure_DISC05 <= 49 tenure_DISC05 <= 49
## InternetService1 InternetService1
## PaperlessBilling1 PaperlessBilling1
## tenure_DISC02 <= 5 tenure_DISC02 <= 5
## tenure_DISC06 <= 59 tenure_DISC06 <= 59
## MonthlyCharges_DISC05 > 106.75 MonthlyCharges_DISC05 > 106.75
## tenure_DISC04 <= 22 tenure_DISC04 <= 22
## tenure_DISC07 <= 70 tenure_DISC07 <= 70
## tenure_DISC03 <= 16 tenure_DISC03 <= 16
## OnlineSecurity1 OnlineSecurity1
## SeniorCitizen1 SeniorCitizen1
## MonthlyCharges_DISC03 <= 68.8 MonthlyCharges_DISC03 <= 68.8
## TechSupport1 TechSupport1
## OnlineBackup1 OnlineBackup1
## StreamingMovies1 StreamingMovies1
## Dependents1 Dependents1
## gender1 gender1
## tenure_DISC08 > 70 tenure_DISC08 > 70
## StreamingTV1 StreamingTV1
## MultipleLines1 MultipleLines1
## PaymentMethodCredit card (automatic) PaymentMethodCredit card (automatic)
## MonthlyCharges_DISC02 <= 55.95 MonthlyCharges_DISC02 <= 55.95
## Partner1 Partner1
## DeviceProtection1 DeviceProtection1
## PhoneService1 PhoneService1
## PaymentMethodMailed check PaymentMethodMailed check
## rel.inf
## ContractTwo year 16.9275344
## PaymentMethodElectronic check 12.2860024
## ContractOne year 7.8329931
## MonthlyCharges_DISC04 <= 106.75 5.8055755
## tenure_DISC05 <= 49 4.7501324
## InternetService1 4.3015432
## PaperlessBilling1 3.4378481
## tenure_DISC02 <= 5 3.4179356
## tenure_DISC06 <= 59 3.2130925
## MonthlyCharges_DISC05 > 106.75 3.1377010
## tenure_DISC04 <= 22 3.1016948
## tenure_DISC07 <= 70 2.8734428
## tenure_DISC03 <= 16 2.6288730
## OnlineSecurity1 2.6116687
## SeniorCitizen1 2.4787892
## MonthlyCharges_DISC03 <= 68.8 2.3410436
## TechSupport1 2.1700446
## OnlineBackup1 1.6351840
## StreamingMovies1 1.5996770
## Dependents1 1.4822653
## gender1 1.4783326
## tenure_DISC08 > 70 1.4380768
## StreamingTV1 1.4309935
## MultipleLines1 1.4293148
## PaymentMethodCredit card (automatic) 1.3921943
## MonthlyCharges_DISC02 <= 55.95 1.3051097
## Partner1 1.2862856
## DeviceProtection1 0.8174539
## PhoneService1 0.7005791
## PaymentMethodMailed check 0.6886184
Construímos un vector de probabilidades aplicando predict al conjunto de test
gbm_predict <- predict(object = gbm,
newdata = test_gbm,
type = "prob")
gbm_predict <- (gbm_predict)[,2]
head(gbm_predict)
## [1] 0.75573464 0.01453469 0.30516716 0.01273036 0.43707954 0.13707004
Graficamos el modelo generado
plot(gbm_predict~test_gbm$TARGET_Churn)
Con la función umbrales probamos diferentes cortes
umb_gbm<-umbrales(test_gbm$TARGET_Churn,gbm_predict)
umb_gbm
## umbral acierto precision cobertura F1
## 1 0.05 52.41898 34.42408 97.951583 50.944310
## 2 0.10 61.10850 38.90160 94.972067 55.194805
## 3 0.15 66.65101 42.41893 90.130354 57.687723
## 4 0.20 69.93894 44.98539 86.033520 59.079284
## 5 0.25 73.36778 48.32215 80.446927 60.377358
## 6 0.30 76.42085 52.24647 75.791434 61.854103
## 7 0.35 77.73603 54.44288 71.880819 61.958266
## 8 0.40 79.56787 58.25243 67.039106 62.337662
## 9 0.45 80.41334 61.15242 61.266294 61.209302
## 10 0.50 80.88304 64.31718 54.376164 58.930373
## 11 0.55 80.69516 66.66667 46.927374 55.081967
## 12 0.60 79.84969 69.14894 36.312849 47.619048
## 13 0.65 79.42696 72.60274 29.608939 42.063492
## 14 0.70 79.05120 80.95238 22.160149 34.795322
## 15 0.75 77.54814 86.41975 13.035382 22.653722
## 16 0.80 76.79662 89.09091 9.124767 16.554054
## 17 0.85 75.48145 82.60870 3.538175 6.785714
## 18 0.90 74.96477 100.00000 0.744879 1.478743
## 19 0.95 0.95000 0.95000 0.950000 0.950000
Seleccionamos automáticamente el mejor umbral
umbral_final_gbm<-umb_gbm[which.max(umb_gbm$F1),1]
umbral_final_gbm
## [1] 0.4
Evaluamos la matriz de confusión y las métricas con el umbral optimizado
confusion(test_gbm$TARGET_Churn,gbm_predict,umbral_final_gbm)
##
## real FALSE TRUE
## No 1334 258
## Yes 177 360
gbm_metricas<-filter(umb_gbm,umbral==umbral_final_gbm)
gbm_metricas
## umbral acierto precision cobertura F1
## 1 0.4 79.56787 58.25243 67.03911 62.33766
Evaluamos la ROC
#creamos el objeto prediction
gbm_prediction<-prediction(gbm_predict,test_gbm$TARGET_Churn)
#visualizamos la ROC
roc(gbm_prediction)
Sacamos las métricas definitivas incluyendo el AUC
gbm_metricas<-cbind(gbm_metricas,AUC=round(auc(gbm_prediction),2)*100)
print(t(gbm_metricas))
## [,1]
## umbral 0.40000
## acierto 79.56787
## precision 58.25243
## cobertura 67.03911
## F1 62.33766
## AUC 85.00000
4.3.8 - Redes Neuronales con caret y ajuste de hiperparámetros
df_nnet <- df
glimpse(df_nnet)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, B...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, ...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 03 <= 68.8, 02 <= 55.95, 02 <= 55....
## $ TARGET_Churn <fct> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,...
df_nnet <- df_nnet %>%
mutate(TARGET_Churn = ifelse(TARGET_Churn == 0, "No", "Yes"))
df_nnet$TARGET_Churn <- as.factor(df_nnet$TARGET_Churn)
glimpse(df_nnet)
## Rows: 7,032
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ OnlineSecurity <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ DeviceProtection <fct> 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ TechSupport <fct> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, B...
## $ tenure_DISC <fct> 01 <= 1, 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, ...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 03 <= 68.8, 02 <= 55.95, 02 <= 55....
## $ TARGET_Churn <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No,...
train_nnet <- train %>% select(-random)
glimpse(train_nnet)
## Rows: 4,903
## Columns: 20
## $ customerID <chr> "5575-GNVDE", "3668-QPYBK", "7795-CFOCW", "9237...
## $ gender <fct> 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ Partner <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,...
## $ PhoneService <fct> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ MultipleLines <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,...
## $ OnlineBackup <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,...
## $ DeviceProtection <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,...
## $ StreamingTV <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,...
## $ Contract <fct> One year, Month-to-month, One year, Month-to-mo...
## $ PaperlessBilling <fct> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,...
## $ PaymentMethod <fct> Mailed check, Mailed check, Bank transfer (auto...
## $ tenure_DISC <fct> 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, 03 <= 16,...
## $ MonthlyCharges_DISC <fct> 03 <= 68.8, 02 <= 55.95, 02 <= 55.95, 04 <= 106...
## $ TARGET_Churn <fct> 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,...
train_nnet <- train_nnet %>%
mutate(TARGET_Churn = ifelse(TARGET_Churn == 0, "No", "Yes"))
train_nnet$TARGET_Churn <- as.factor(train_nnet$TARGET_Churn)
glimpse(train_nnet)
## Rows: 4,903
## Columns: 20
## $ customerID <chr> "5575-GNVDE", "3668-QPYBK", "7795-CFOCW", "9237...
## $ gender <fct> 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...
## $ Partner <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,...
## $ Dependents <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,...
## $ PhoneService <fct> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ MultipleLines <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ InternetService <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,...
## $ OnlineBackup <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,...
## $ DeviceProtection <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,...
## $ StreamingTV <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ StreamingMovies <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,...
## $ Contract <fct> One year, Month-to-month, One year, Month-to-mo...
## $ PaperlessBilling <fct> 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,...
## $ PaymentMethod <fct> Mailed check, Mailed check, Bank transfer (auto...
## $ tenure_DISC <fct> 05 <= 49, 02 <= 5, 05 <= 49, 02 <= 5, 03 <= 16,...
## $ MonthlyCharges_DISC <fct> 03 <= 68.8, 02 <= 55.95, 02 <= 55.95, 04 <= 106...
## $ TARGET_Churn <fct> No, Yes, No, Yes, Yes, No, No, Yes, No, No, No,...
test_nnet <- test %>% select(-random)
glimpse(test_nnet)
## Rows: 2,129
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "7469-LKBCI", "5129-JLPIS", "9959...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,...
## $ Dependents <fct> 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,...
## $ InternetService <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,...
## $ DeviceProtection <fct> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ StreamingMovies <fct> 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
## $ Contract <fct> Month-to-month, Two year, Month-to-month, Two y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,...
## $ PaymentMethod <fct> Electronic check, Credit card (automatic), Elec...
## $ tenure_DISC <fct> 01 <= 1, 03 <= 16, 05 <= 49, 08 > 70, 04 <= 22,...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 01 <= 26.9, 04 <= 106.75, 04 <= 10...
## $ TARGET_Churn <fct> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,...
test_nnet <- test_nnet %>%
mutate(TARGET_Churn = ifelse(TARGET_Churn == 0, "No", "Yes"))
test_nnet$TARGET_Churn <- as.factor(test_nnet$TARGET_Churn)
glimpse(test_nnet)
## Rows: 2,129
## Columns: 20
## $ customerID <chr> "7590-VHVEG", "7469-LKBCI", "5129-JLPIS", "9959...
## $ gender <fct> 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,...
## $ SeniorCitizen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <fct> 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,...
## $ Dependents <fct> 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,...
## $ PhoneService <fct> 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ MultipleLines <fct> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,...
## $ InternetService <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,...
## $ OnlineSecurity <fct> 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ OnlineBackup <fct> 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,...
## $ DeviceProtection <fct> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,...
## $ TechSupport <fct> 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,...
## $ StreamingTV <fct> 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ StreamingMovies <fct> 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
## $ Contract <fct> Month-to-month, Two year, Month-to-month, Two y...
## $ PaperlessBilling <fct> 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,...
## $ PaymentMethod <fct> Electronic check, Credit card (automatic), Elec...
## $ tenure_DISC <fct> 01 <= 1, 03 <= 16, 05 <= 49, 08 > 70, 04 <= 22,...
## $ MonthlyCharges_DISC <fct> 02 <= 55.95, 01 <= 26.9, 04 <= 106.75, 04 <= 10...
## $ TARGET_Churn <fct> No, No, No, No, No, No, Yes, Yes, No, No, No, Y...
Creamos un objeto con los hiperparámetros a ajustar
hiperparametros <- expand.grid(size = c(10, 30, 50, 75, 100, 120),
decay = c(0.0001, 0.01, 0.5))
Creamos un objeto de control para modelar con caret
control_train <- trainControl(method = "cv",
number = 10,
returnResamp = "final", verboseIter = FALSE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
allowParallel = TRUE)
Personalizamos la fórmula para aplcar el modelo de redes neuronales
formula_nnet <- formula
tic()
nnet <- train(formula_nnet,train_nnet,
method = 'nnet',
tuneGrid = hiperparametros,
metric = 'ROC',
trControl = control_train,
rang = c(-1, 1),
MaxNWts = 2000,
trace = FALSE)
toc()
## 607 sec elapsed
nnet
## Neural Network
##
## 4903 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4413, 4411, 4413, 4413, 4413, 4413, ...
## Resampling results across tuning parameters:
##
## size decay ROC Sens Spec
## 10 0.0001 0.7988840 0.8496276 0.5316238
## 10 0.0100 0.7965562 0.8504718 0.5150432
## 10 0.5000 0.8280139 0.8910748 0.4954831
## 30 0.0001 0.7657812 0.8297529 0.5037482
## 30 0.0100 0.7671956 0.8294642 0.5067613
## 30 0.5000 0.8291737 0.8846353 0.5097520
## 50 0.0001 0.7630776 0.8238604 0.5030019
## 50 0.0100 0.7684666 0.8389817 0.5127595
## 50 0.5000 0.8335794 0.8949932 0.4993042
## 75 0.0001 NaN NaN NaN
## 75 0.0100 NaN NaN NaN
## 75 0.5000 NaN NaN NaN
## 100 0.0001 NaN NaN NaN
## 100 0.0100 NaN NaN NaN
## 100 0.5000 NaN NaN NaN
## 120 0.0001 NaN NaN NaN
## 120 0.0100 NaN NaN NaN
## 120 0.5000 NaN NaN NaN
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 50 and decay = 0.5.
ggplot(nnet, highlight = TRUE) +
labs(title = "Evolución del ROC del modelo NNET") +
theme_bw()
Generamos un resumen del modelo generado
summary(nnet)
## a 30-50-1 network with 1601 weights
## options were - entropy fitting decay=0.5
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## -0.11 -0.09 0.09 -0.01 0.10 -0.10 0.11 0.10 0.29 -0.06
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1
## 0.11 0.23 0.03 -0.02 0.10 -0.14 0.09 0.24 -0.10 -0.04
## i20->h1 i21->h1 i22->h1 i23->h1 i24->h1 i25->h1 i26->h1 i27->h1 i28->h1 i29->h1
## -0.07 -0.22 0.05 -0.32 -0.24 -0.11 -0.37 -0.03 0.04 -0.14
## i30->h1
## 0.16
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## -0.17 0.21 0.12 0.01 0.23 0.10 -0.05 0.01 0.30 -0.22
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2
## -0.26 0.24 -0.31 -0.13 -0.04 -0.24 -0.27 0.27 -0.29 0.01
## i20->h2 i21->h2 i22->h2 i23->h2 i24->h2 i25->h2 i26->h2 i27->h2 i28->h2 i29->h2
## -0.20 0.00 0.08 0.08 -0.40 -0.42 -0.35 -0.16 0.12 0.02
## i30->h2
## -0.04
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -0.12 -0.07 0.10 0.00 0.11 -0.12 0.12 0.11 0.30 -0.05
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3
## 0.12 0.24 0.04 -0.02 0.10 -0.15 0.11 0.24 -0.10 -0.05
## i20->h3 i21->h3 i22->h3 i23->h3 i24->h3 i25->h3 i26->h3 i27->h3 i28->h3 i29->h3
## -0.07 -0.22 0.05 -0.34 -0.23 -0.10 -0.37 -0.03 0.04 -0.14
## i30->h3
## 0.17
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## 0.10 0.18 0.10 0.05 0.26 -0.19 0.33 0.37 0.44 0.47
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 i18->h4 i19->h4
## -0.02 0.63 -0.23 0.10 -0.09 -0.11 0.31 -0.02 -0.21 0.26
## i20->h4 i21->h4 i22->h4 i23->h4 i24->h4 i25->h4 i26->h4 i27->h4 i28->h4 i29->h4
## -0.35 0.14 0.03 -0.62 -0.08 0.21 -0.29 0.36 -0.03 -0.21
## i30->h4
## 0.19
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## 0.01 -0.11 0.02 -0.06 0.02 0.11 0.13 0.10 0.32 -0.08
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 i18->h5 i19->h5
## -0.04 0.16 -0.03 -0.02 0.08 -0.12 -0.05 0.16 -0.19 0.10
## i20->h5 i21->h5 i22->h5 i23->h5 i24->h5 i25->h5 i26->h5 i27->h5 i28->h5 i29->h5
## -0.08 -0.25 0.02 -0.10 -0.34 -0.14 -0.37 -0.05 -0.01 -0.01
## i30->h5
## 0.13
## b->h6 i1->h6 i2->h6 i3->h6 i4->h6 i5->h6 i6->h6 i7->h6 i8->h6 i9->h6
## 0.01 -0.11 0.02 -0.06 0.02 0.11 0.13 0.10 0.32 -0.08
## i10->h6 i11->h6 i12->h6 i13->h6 i14->h6 i15->h6 i16->h6 i17->h6 i18->h6 i19->h6
## -0.04 0.16 -0.04 -0.03 0.08 -0.12 -0.05 0.16 -0.19 0.10
## i20->h6 i21->h6 i22->h6 i23->h6 i24->h6 i25->h6 i26->h6 i27->h6 i28->h6 i29->h6
## -0.08 -0.25 0.02 -0.10 -0.35 -0.14 -0.38 -0.05 -0.01 -0.01
## i30->h6
## 0.12
## b->h7 i1->h7 i2->h7 i3->h7 i4->h7 i5->h7 i6->h7 i7->h7 i8->h7 i9->h7
## -0.29 -0.13 -0.03 -0.03 -0.02 -0.17 -0.30 -0.23 0.03 -0.06
## i10->h7 i11->h7 i12->h7 i13->h7 i14->h7 i15->h7 i16->h7 i17->h7 i18->h7 i19->h7
## -0.10 -0.05 -0.10 -0.05 0.07 0.06 -0.29 -0.01 -0.20 0.01
## i20->h7 i21->h7 i22->h7 i23->h7 i24->h7 i25->h7 i26->h7 i27->h7 i28->h7 i29->h7
## 0.10 -0.05 -0.08 0.03 -0.01 -0.01 0.08 -0.15 0.10 -0.12
## i30->h7
## -0.05
## b->h8 i1->h8 i2->h8 i3->h8 i4->h8 i5->h8 i6->h8 i7->h8 i8->h8 i9->h8
## -0.02 0.10 0.05 0.06 0.05 0.11 -0.05 0.04 0.26 -0.13
## i10->h8 i11->h8 i12->h8 i13->h8 i14->h8 i15->h8 i16->h8 i17->h8 i18->h8 i19->h8
## 0.07 0.08 -0.06 -0.10 0.04 -0.05 -0.15 0.21 -0.25 0.09
## i20->h8 i21->h8 i22->h8 i23->h8 i24->h8 i25->h8 i26->h8 i27->h8 i28->h8 i29->h8
## -0.08 -0.20 -0.02 0.05 -0.25 -0.13 -0.24 -0.06 0.00 0.09
## i30->h8
## 0.00
## b->h9 i1->h9 i2->h9 i3->h9 i4->h9 i5->h9 i6->h9 i7->h9 i8->h9 i9->h9
## -0.03 0.02 0.06 0.06 -0.05 0.13 0.10 0.10 0.29 -0.19
## i10->h9 i11->h9 i12->h9 i13->h9 i14->h9 i15->h9 i16->h9 i17->h9 i18->h9 i19->h9
## 0.08 0.10 0.02 -0.05 0.05 -0.07 -0.13 0.24 -0.19 -0.01
## i20->h9 i21->h9 i22->h9 i23->h9 i24->h9 i25->h9 i26->h9 i27->h9 i28->h9 i29->h9
## -0.08 -0.22 -0.02 0.04 -0.31 -0.25 -0.34 -0.08 -0.04 0.15
## i30->h9
## 0.04
## b->h10 i1->h10 i2->h10 i3->h10 i4->h10 i5->h10 i6->h10 i7->h10
## -0.02 0.10 0.05 0.06 0.05 0.11 -0.05 0.04
## i8->h10 i9->h10 i10->h10 i11->h10 i12->h10 i13->h10 i14->h10 i15->h10
## 0.26 -0.13 0.07 0.08 -0.06 -0.10 0.04 -0.05
## i16->h10 i17->h10 i18->h10 i19->h10 i20->h10 i21->h10 i22->h10 i23->h10
## -0.15 0.21 -0.25 0.09 -0.08 -0.20 -0.02 0.05
## i24->h10 i25->h10 i26->h10 i27->h10 i28->h10 i29->h10 i30->h10
## -0.25 -0.13 -0.24 -0.06 0.00 0.09 0.00
## b->h11 i1->h11 i2->h11 i3->h11 i4->h11 i5->h11 i6->h11 i7->h11
## -0.08 0.12 -0.01 0.04 0.11 0.07 -0.20 -0.03
## i8->h11 i9->h11 i10->h11 i11->h11 i12->h11 i13->h11 i14->h11 i15->h11
## 0.23 -0.07 0.09 0.09 -0.10 -0.13 0.11 -0.04
## i16->h11 i17->h11 i18->h11 i19->h11 i20->h11 i21->h11 i22->h11 i23->h11
## -0.18 0.14 -0.34 0.15 -0.10 -0.24 -0.02 0.02
## i24->h11 i25->h11 i26->h11 i27->h11 i28->h11 i29->h11 i30->h11
## -0.23 -0.10 -0.21 -0.12 0.06 0.04 -0.01
## b->h12 i1->h12 i2->h12 i3->h12 i4->h12 i5->h12 i6->h12 i7->h12
## -0.39 -0.11 -0.08 -0.06 -0.06 -0.17 -0.38 -0.25
## i8->h12 i9->h12 i10->h12 i11->h12 i12->h12 i13->h12 i14->h12 i15->h12
## -0.07 -0.03 -0.10 0.02 -0.14 -0.11 0.08 0.00
## i16->h12 i17->h12 i18->h12 i19->h12 i20->h12 i21->h12 i22->h12 i23->h12
## -0.30 -0.07 -0.26 -0.11 0.02 -0.15 -0.16 -0.12
## i24->h12 i25->h12 i26->h12 i27->h12 i28->h12 i29->h12 i30->h12
## -0.07 -0.08 -0.08 -0.12 0.17 -0.23 -0.02
## b->h13 i1->h13 i2->h13 i3->h13 i4->h13 i5->h13 i6->h13 i7->h13
## 0.07 0.07 -0.12 0.12 0.11 0.07 -0.15 -0.17
## i8->h13 i9->h13 i10->h13 i11->h13 i12->h13 i13->h13 i14->h13 i15->h13
## 0.01 0.16 0.01 -0.13 -0.30 -0.18 -0.07 0.20
## i16->h13 i17->h13 i18->h13 i19->h13 i20->h13 i21->h13 i22->h13 i23->h13
## 0.12 0.06 -0.29 0.27 -0.02 0.12 0.00 0.01
## i24->h13 i25->h13 i26->h13 i27->h13 i28->h13 i29->h13 i30->h13
## 0.37 0.31 0.43 0.03 0.06 -0.03 -0.13
## b->h14 i1->h14 i2->h14 i3->h14 i4->h14 i5->h14 i6->h14 i7->h14
## -0.01 -0.61 0.02 0.32 1.24 0.72 0.55 -0.37
## i8->h14 i9->h14 i10->h14 i11->h14 i12->h14 i13->h14 i14->h14 i15->h14
## 0.58 -0.45 0.39 0.08 0.23 0.64 0.44 0.37
## i16->h14 i17->h14 i18->h14 i19->h14 i20->h14 i21->h14 i22->h14 i23->h14
## -0.26 -0.11 -0.09 -0.09 0.17 -1.66 0.88 -0.07
## i24->h14 i25->h14 i26->h14 i27->h14 i28->h14 i29->h14 i30->h14
## 0.63 0.92 0.45 -0.23 0.74 -0.05 -0.54
## b->h15 i1->h15 i2->h15 i3->h15 i4->h15 i5->h15 i6->h15 i7->h15
## -0.01 0.15 -0.17 0.13 0.06 0.03 -0.23 -0.18
## i8->h15 i9->h15 i10->h15 i11->h15 i12->h15 i13->h15 i14->h15 i15->h15
## 0.08 0.07 0.03 -0.12 -0.23 -0.20 0.05 0.20
## i16->h15 i17->h15 i18->h15 i19->h15 i20->h15 i21->h15 i22->h15 i23->h15
## 0.07 0.08 -0.25 0.17 0.01 0.08 -0.08 -0.01
## i24->h15 i25->h15 i26->h15 i27->h15 i28->h15 i29->h15 i30->h15
## 0.24 0.19 0.32 -0.04 -0.01 0.11 -0.14
## b->h16 i1->h16 i2->h16 i3->h16 i4->h16 i5->h16 i6->h16 i7->h16
## -0.11 -0.04 0.01 0.03 -0.02 -0.08 -0.17 -0.17
## i8->h16 i9->h16 i10->h16 i11->h16 i12->h16 i13->h16 i14->h16 i15->h16
## 0.14 0.13 -0.09 -0.10 -0.21 -0.12 0.04 0.17
## i16->h16 i17->h16 i18->h16 i19->h16 i20->h16 i21->h16 i22->h16 i23->h16
## -0.19 -0.04 -0.25 0.24 0.13 -0.02 -0.02 0.05
## i24->h16 i25->h16 i26->h16 i27->h16 i28->h16 i29->h16 i30->h16
## 0.18 0.24 0.33 -0.07 0.02 -0.02 -0.04
## b->h17 i1->h17 i2->h17 i3->h17 i4->h17 i5->h17 i6->h17 i7->h17
## -0.07 -0.44 -0.29 0.25 0.06 0.12 -0.13 -0.12
## i8->h17 i9->h17 i10->h17 i11->h17 i12->h17 i13->h17 i14->h17 i15->h17
## 0.17 0.29 0.11 -0.20 0.21 0.08 0.06 0.39
## i16->h17 i17->h17 i18->h17 i19->h17 i20->h17 i21->h17 i22->h17 i23->h17
## -0.19 0.00 -0.17 -0.29 0.07 -0.30 0.08 -0.48
## i24->h17 i25->h17 i26->h17 i27->h17 i28->h17 i29->h17 i30->h17
## 0.33 0.43 0.18 -0.38 0.02 0.16 0.10
## b->h18 i1->h18 i2->h18 i3->h18 i4->h18 i5->h18 i6->h18 i7->h18
## -0.02 0.15 -0.17 0.13 0.06 0.03 -0.23 -0.18
## i8->h18 i9->h18 i10->h18 i11->h18 i12->h18 i13->h18 i14->h18 i15->h18
## 0.08 0.07 0.03 -0.12 -0.23 -0.20 0.05 0.20
## i16->h18 i17->h18 i18->h18 i19->h18 i20->h18 i21->h18 i22->h18 i23->h18
## 0.07 0.08 -0.25 0.17 0.01 0.08 -0.08 -0.01
## i24->h18 i25->h18 i26->h18 i27->h18 i28->h18 i29->h18 i30->h18
## 0.24 0.19 0.31 -0.04 -0.01 0.11 -0.14
## b->h19 i1->h19 i2->h19 i3->h19 i4->h19 i5->h19 i6->h19 i7->h19
## 0.05 -0.06 -0.13 0.17 0.12 0.05 -0.14 -0.07
## i8->h19 i9->h19 i10->h19 i11->h19 i12->h19 i13->h19 i14->h19 i15->h19
## 0.07 0.11 0.13 -0.05 -0.29 -0.15 -0.11 0.22
## i16->h19 i17->h19 i18->h19 i19->h19 i20->h19 i21->h19 i22->h19 i23->h19
## 0.00 0.08 -0.26 0.18 0.09 0.13 -0.01 -0.09
## i24->h19 i25->h19 i26->h19 i27->h19 i28->h19 i29->h19 i30->h19
## 0.32 0.25 0.34 0.06 0.12 -0.08 -0.09
## b->h20 i1->h20 i2->h20 i3->h20 i4->h20 i5->h20 i6->h20 i7->h20
## -0.08 -0.02 -0.04 0.08 0.03 0.01 -0.25 -0.07
## i8->h20 i9->h20 i10->h20 i11->h20 i12->h20 i13->h20 i14->h20 i15->h20
## 0.08 0.10 -0.01 -0.06 -0.13 -0.06 0.00 0.13
## i16->h20 i17->h20 i18->h20 i19->h20 i20->h20 i21->h20 i22->h20 i23->h20
## -0.15 -0.02 -0.18 0.10 0.12 -0.04 -0.08 -0.05
## i24->h20 i25->h20 i26->h20 i27->h20 i28->h20 i29->h20 i30->h20
## 0.09 0.09 0.16 -0.02 0.10 -0.03 -0.05
## b->h21 i1->h21 i2->h21 i3->h21 i4->h21 i5->h21 i6->h21 i7->h21
## -0.11 -0.05 0.01 0.03 -0.03 -0.10 -0.16 -0.18
## i8->h21 i9->h21 i10->h21 i11->h21 i12->h21 i13->h21 i14->h21 i15->h21
## 0.14 0.13 -0.09 -0.10 -0.22 -0.13 0.05 0.17
## i16->h21 i17->h21 i18->h21 i19->h21 i20->h21 i21->h21 i22->h21 i23->h21
## -0.19 -0.04 -0.26 0.25 0.12 -0.01 -0.01 0.05
## i24->h21 i25->h21 i26->h21 i27->h21 i28->h21 i29->h21 i30->h21
## 0.18 0.25 0.34 -0.08 0.01 -0.01 -0.04
## b->h22 i1->h22 i2->h22 i3->h22 i4->h22 i5->h22 i6->h22 i7->h22
## -0.34 0.83 1.41 0.93 -0.03 0.09 -0.38 -0.10
## i8->h22 i9->h22 i10->h22 i11->h22 i12->h22 i13->h22 i14->h22 i15->h22
## 0.76 0.93 -0.11 1.24 -0.85 0.60 0.12 0.01
## i16->h22 i17->h22 i18->h22 i19->h22 i20->h22 i21->h22 i22->h22 i23->h22
## -0.40 1.13 -0.51 0.20 1.54 0.46 -0.66 1.32
## i24->h22 i25->h22 i26->h22 i27->h22 i28->h22 i29->h22 i30->h22
## 0.03 -0.04 -0.25 -0.25 0.89 -0.89 -0.43
## b->h23 i1->h23 i2->h23 i3->h23 i4->h23 i5->h23 i6->h23 i7->h23
## 0.07 -0.29 -0.10 -0.67 0.56 0.46 -1.25 0.52
## i8->h23 i9->h23 i10->h23 i11->h23 i12->h23 i13->h23 i14->h23 i15->h23
## 1.05 0.88 -0.23 0.63 0.27 -1.54 0.31 -0.01
## i16->h23 i17->h23 i18->h23 i19->h23 i20->h23 i21->h23 i22->h23 i23->h23
## -0.10 1.19 1.28 -0.43 -0.15 0.22 1.21 0.57
## i24->h23 i25->h23 i26->h23 i27->h23 i28->h23 i29->h23 i30->h23
## 0.27 0.95 0.21 0.36 0.21 0.28 -0.28
## b->h24 i1->h24 i2->h24 i3->h24 i4->h24 i5->h24 i6->h24 i7->h24
## -0.11 -0.22 -0.28 0.22 -0.09 0.03 -0.23 -0.10
## i8->h24 i9->h24 i10->h24 i11->h24 i12->h24 i13->h24 i14->h24 i15->h24
## 0.19 0.20 0.08 -0.17 0.08 -0.05 0.06 0.31
## i16->h24 i17->h24 i18->h24 i19->h24 i20->h24 i21->h24 i22->h24 i23->h24
## -0.02 0.04 -0.16 -0.22 0.09 -0.06 -0.15 -0.35
## i24->h24 i25->h24 i26->h24 i27->h24 i28->h24 i29->h24 i30->h24
## 0.16 0.17 0.11 -0.37 -0.07 0.23 0.11
## b->h25 i1->h25 i2->h25 i3->h25 i4->h25 i5->h25 i6->h25 i7->h25
## -0.61 0.28 -0.47 -0.99 -0.20 0.85 1.11 0.15
## i8->h25 i9->h25 i10->h25 i11->h25 i12->h25 i13->h25 i14->h25 i15->h25
## 0.89 -0.63 -0.34 1.96 0.40 -0.14 1.07 0.93
## i16->h25 i17->h25 i18->h25 i19->h25 i20->h25 i21->h25 i22->h25 i23->h25
## -0.05 -0.44 0.08 -0.59 -1.53 1.45 1.91 0.42
## i24->h25 i25->h25 i26->h25 i27->h25 i28->h25 i29->h25 i30->h25
## 0.35 0.91 0.61 0.45 0.48 -1.54 1.06
## b->h26 i1->h26 i2->h26 i3->h26 i4->h26 i5->h26 i6->h26 i7->h26
## -0.10 -0.25 0.19 0.93 1.26 0.77 -0.45 -0.16
## i8->h26 i9->h26 i10->h26 i11->h26 i12->h26 i13->h26 i14->h26 i15->h26
## -0.23 0.40 0.33 0.26 0.03 0.25 0.59 0.06
## i16->h26 i17->h26 i18->h26 i19->h26 i20->h26 i21->h26 i22->h26 i23->h26
## -0.22 -0.22 -0.40 1.04 0.10 -0.53 0.62 0.17
## i24->h26 i25->h26 i26->h26 i27->h26 i28->h26 i29->h26 i30->h26
## -0.24 0.54 0.40 0.17 0.74 -0.36 -0.34
## b->h27 i1->h27 i2->h27 i3->h27 i4->h27 i5->h27 i6->h27 i7->h27
## 0.02 -0.07 -0.11 0.17 0.11 0.04 -0.16 -0.05
## i8->h27 i9->h27 i10->h27 i11->h27 i12->h27 i13->h27 i14->h27 i15->h27
## 0.08 0.08 0.13 -0.05 -0.26 -0.13 -0.09 0.20
## i16->h27 i17->h27 i18->h27 i19->h27 i20->h27 i21->h27 i22->h27 i23->h27
## -0.05 0.06 -0.24 0.16 0.11 0.09 -0.02 -0.11
## i24->h27 i25->h27 i26->h27 i27->h27 i28->h27 i29->h27 i30->h27
## 0.26 0.21 0.28 0.04 0.13 -0.06 -0.08
## b->h28 i1->h28 i2->h28 i3->h28 i4->h28 i5->h28 i6->h28 i7->h28
## -0.20 -0.24 -0.19 0.01 -0.10 -0.48 -0.03 -0.40
## i8->h28 i9->h28 i10->h28 i11->h28 i12->h28 i13->h28 i14->h28 i15->h28
## 0.27 -0.22 0.31 -0.11 -0.22 -0.26 0.12 0.23
## i16->h28 i17->h28 i18->h28 i19->h28 i20->h28 i21->h28 i22->h28 i23->h28
## -0.10 0.23 -0.61 0.53 -0.06 0.10 0.35 0.26
## i24->h28 i25->h28 i26->h28 i27->h28 i28->h28 i29->h28 i30->h28
## 0.35 0.27 0.62 -0.43 -0.39 0.28 0.02
## b->h29 i1->h29 i2->h29 i3->h29 i4->h29 i5->h29 i6->h29 i7->h29
## -0.21 -0.29 -0.22 0.02 -0.09 -0.50 -0.08 -0.41
## i8->h29 i9->h29 i10->h29 i11->h29 i12->h29 i13->h29 i14->h29 i15->h29
## 0.25 -0.28 0.34 -0.13 -0.17 -0.23 0.11 0.24
## i16->h29 i17->h29 i18->h29 i19->h29 i20->h29 i21->h29 i22->h29 i23->h29
## -0.11 0.28 -0.60 0.50 -0.06 0.11 0.35 0.31
## i24->h29 i25->h29 i26->h29 i27->h29 i28->h29 i29->h29 i30->h29
## 0.33 0.20 0.61 -0.46 -0.39 0.30 0.02
## b->h30 i1->h30 i2->h30 i3->h30 i4->h30 i5->h30 i6->h30 i7->h30
## -0.15 0.13 -0.06 0.02 0.12 0.03 -0.30 -0.09
## i8->h30 i9->h30 i10->h30 i11->h30 i12->h30 i13->h30 i14->h30 i15->h30
## 0.20 -0.03 0.09 0.10 -0.11 -0.16 0.14 -0.06
## i16->h30 i17->h30 i18->h30 i19->h30 i20->h30 i21->h30 i22->h30 i23->h30
## -0.21 0.09 -0.40 0.15 -0.14 -0.28 -0.04 -0.02
## i24->h30 i25->h30 i26->h30 i27->h30 i28->h30 i29->h30 i30->h30
## -0.23 -0.12 -0.22 -0.17 0.09 0.01 0.00
## b->h31 i1->h31 i2->h31 i3->h31 i4->h31 i5->h31 i6->h31 i7->h31
## -0.30 -0.13 -0.03 -0.04 -0.02 -0.16 -0.30 -0.23
## i8->h31 i9->h31 i10->h31 i11->h31 i12->h31 i13->h31 i14->h31 i15->h31
## 0.02 -0.06 -0.10 -0.04 -0.10 -0.05 0.07 0.05
## i16->h31 i17->h31 i18->h31 i19->h31 i20->h31 i21->h31 i22->h31 i23->h31
## -0.29 -0.01 -0.20 0.00 0.09 -0.06 -0.09 0.01
## i24->h31 i25->h31 i26->h31 i27->h31 i28->h31 i29->h31 i30->h31
## -0.02 -0.02 0.06 -0.14 0.11 -0.13 -0.05
## b->h32 i1->h32 i2->h32 i3->h32 i4->h32 i5->h32 i6->h32 i7->h32
## -0.09 -0.03 0.00 0.04 -0.01 -0.04 -0.21 -0.13
## i8->h32 i9->h32 i10->h32 i11->h32 i12->h32 i13->h32 i14->h32 i15->h32
## 0.11 0.13 -0.08 -0.09 -0.17 -0.09 0.03 0.16
## i16->h32 i17->h32 i18->h32 i19->h32 i20->h32 i21->h32 i22->h32 i23->h32
## -0.18 -0.04 -0.21 0.18 0.13 -0.02 -0.06 0.02
## i24->h32 i25->h32 i26->h32 i27->h32 i28->h32 i29->h32 i30->h32
## 0.14 0.19 0.27 -0.04 0.06 -0.03 -0.05
## b->h33 i1->h33 i2->h33 i3->h33 i4->h33 i5->h33 i6->h33 i7->h33
## -0.16 -0.11 -0.06 0.01 -0.09 -0.34 -0.01 -0.34
## i8->h33 i9->h33 i10->h33 i11->h33 i12->h33 i13->h33 i14->h33 i15->h33
## 0.27 -0.01 0.11 -0.10 -0.30 -0.25 0.11 0.21
## i16->h33 i17->h33 i18->h33 i19->h33 i20->h33 i21->h33 i22->h33 i23->h33
## -0.14 0.07 -0.49 0.49 0.02 0.04 0.21 0.14
## i24->h33 i25->h33 i26->h33 i27->h33 i28->h33 i29->h33 i30->h33
## 0.32 0.37 0.57 -0.28 -0.24 0.14 -0.01
## b->h34 i1->h34 i2->h34 i3->h34 i4->h34 i5->h34 i6->h34 i7->h34
## -0.17 -0.14 -0.09 0.00 -0.10 -0.39 0.00 -0.37
## i8->h34 i9->h34 i10->h34 i11->h34 i12->h34 i13->h34 i14->h34 i15->h34
## 0.28 -0.08 0.18 -0.10 -0.29 -0.27 0.12 0.22
## i16->h34 i17->h34 i18->h34 i19->h34 i20->h34 i21->h34 i22->h34 i23->h34
## -0.12 0.12 -0.54 0.51 -0.01 0.06 0.27 0.17
## i24->h34 i25->h34 i26->h34 i27->h34 i28->h34 i29->h34 i30->h34
## 0.34 0.35 0.60 -0.33 -0.30 0.19 0.00
## b->h35 i1->h35 i2->h35 i3->h35 i4->h35 i5->h35 i6->h35 i7->h35
## -0.19 -0.20 -0.15 0.01 -0.10 -0.45 -0.01 -0.39
## i8->h35 i9->h35 i10->h35 i11->h35 i12->h35 i13->h35 i14->h35 i15->h35
## 0.28 -0.17 0.27 -0.10 -0.25 -0.27 0.13 0.23
## i16->h35 i17->h35 i18->h35 i19->h35 i20->h35 i21->h35 i22->h35 i23->h35
## -0.10 0.19 -0.59 0.53 -0.05 0.09 0.32 0.23
## i24->h35 i25->h35 i26->h35 i27->h35 i28->h35 i29->h35 i30->h35
## 0.36 0.31 0.62 -0.39 -0.37 0.25 0.01
## b->h36 i1->h36 i2->h36 i3->h36 i4->h36 i5->h36 i6->h36 i7->h36
## -0.24 -0.17 -0.05 0.00 -0.02 -0.22 -0.26 -0.25
## i8->h36 i9->h36 i10->h36 i11->h36 i12->h36 i13->h36 i14->h36 i15->h36
## 0.10 -0.12 -0.01 -0.11 -0.08 -0.06 0.07 0.12
## i16->h36 i17->h36 i18->h36 i19->h36 i20->h36 i21->h36 i22->h36 i23->h36
## -0.27 0.08 -0.24 0.14 0.10 0.00 0.01 0.16
## i24->h36 i25->h36 i26->h36 i27->h36 i28->h36 i29->h36 i30->h36
## 0.06 0.03 0.23 -0.22 0.00 0.00 -0.05
## b->h37 i1->h37 i2->h37 i3->h37 i4->h37 i5->h37 i6->h37 i7->h37
## 0.07 0.08 -0.12 0.12 0.11 0.08 -0.15 -0.17
## i8->h37 i9->h37 i10->h37 i11->h37 i12->h37 i13->h37 i14->h37 i15->h37
## 0.00 0.16 0.00 -0.13 -0.30 -0.18 -0.06 0.20
## i16->h37 i17->h37 i18->h37 i19->h37 i20->h37 i21->h37 i22->h37 i23->h37
## 0.12 0.06 -0.29 0.27 -0.03 0.12 -0.01 0.02
## i24->h37 i25->h37 i26->h37 i27->h37 i28->h37 i29->h37 i30->h37
## 0.37 0.31 0.43 0.03 0.06 -0.02 -0.13
## b->h38 i1->h38 i2->h38 i3->h38 i4->h38 i5->h38 i6->h38 i7->h38
## -0.07 0.12 0.00 0.05 0.10 0.07 -0.18 -0.02
## i8->h38 i9->h38 i10->h38 i11->h38 i12->h38 i13->h38 i14->h38 i15->h38
## 0.24 -0.08 0.08 0.09 -0.09 -0.13 0.10 -0.04
## i16->h38 i17->h38 i18->h38 i19->h38 i20->h38 i21->h38 i22->h38 i23->h38
## -0.18 0.15 -0.33 0.15 -0.10 -0.23 -0.02 0.02
## i24->h38 i25->h38 i26->h38 i27->h38 i28->h38 i29->h38 i30->h38
## -0.23 -0.10 -0.21 -0.11 0.05 0.05 -0.01
## b->h39 i1->h39 i2->h39 i3->h39 i4->h39 i5->h39 i6->h39 i7->h39
## -0.03 -0.04 0.05 0.04 -0.09 0.13 0.16 0.14
## i8->h39 i9->h39 i10->h39 i11->h39 i12->h39 i13->h39 i14->h39 i15->h39
## 0.31 -0.18 0.11 0.13 0.05 -0.03 0.07 -0.09
## i16->h39 i17->h39 i18->h39 i19->h39 i20->h39 i21->h39 i22->h39 i23->h39
## -0.11 0.22 -0.15 -0.04 -0.08 -0.25 -0.01 -0.01
## i24->h39 i25->h39 i26->h39 i27->h39 i28->h39 i29->h39 i30->h39
## -0.34 -0.30 -0.39 -0.09 -0.05 0.16 0.07
## b->h40 i1->h40 i2->h40 i3->h40 i4->h40 i5->h40 i6->h40 i7->h40
## -0.03 -0.08 0.04 -0.07 0.07 0.15 0.09 0.07
## i8->h40 i9->h40 i10->h40 i11->h40 i12->h40 i13->h40 i14->h40 i15->h40
## 0.37 -0.12 -0.12 0.20 -0.08 -0.06 0.13 -0.17
## i16->h40 i17->h40 i18->h40 i19->h40 i20->h40 i21->h40 i22->h40 i23->h40
## -0.11 0.20 -0.23 0.09 -0.11 -0.22 0.01 -0.10
## i24->h40 i25->h40 i26->h40 i27->h40 i28->h40 i29->h40 i30->h40
## -0.37 -0.19 -0.44 -0.12 0.03 0.00 0.10
## b->h41 i1->h41 i2->h41 i3->h41 i4->h41 i5->h41 i6->h41 i7->h41
## -0.05 -0.05 0.05 -0.06 0.10 0.16 0.07 0.05
## i8->h41 i9->h41 i10->h41 i11->h41 i12->h41 i13->h41 i14->h41 i15->h41
## 0.38 -0.13 -0.15 0.21 -0.10 -0.07 0.15 -0.19
## i16->h41 i17->h41 i18->h41 i19->h41 i20->h41 i21->h41 i22->h41 i23->h41
## -0.14 0.22 -0.25 0.08 -0.13 -0.20 0.02 -0.10
## i24->h41 i25->h41 i26->h41 i27->h41 i28->h41 i29->h41 i30->h41
## -0.38 -0.22 -0.46 -0.15 0.06 0.01 0.09
## b->h42 i1->h42 i2->h42 i3->h42 i4->h42 i5->h42 i6->h42 i7->h42
## 0.05 0.32 0.24 0.06 0.47 -0.34 0.43 0.42
## i8->h42 i9->h42 i10->h42 i11->h42 i12->h42 i13->h42 i14->h42 i15->h42
## 0.45 0.59 0.04 0.75 -0.32 0.04 -0.10 -0.17
## i16->h42 i17->h42 i18->h42 i19->h42 i20->h42 i21->h42 i22->h42 i23->h42
## 0.45 -0.10 -0.27 0.26 -0.47 0.22 -0.03 -0.92
## i24->h42 i25->h42 i26->h42 i27->h42 i28->h42 i29->h42 i30->h42
## 0.09 0.24 -0.34 0.48 -0.08 -0.27 0.23
## b->h43 i1->h43 i2->h43 i3->h43 i4->h43 i5->h43 i6->h43 i7->h43
## -0.01 0.05 0.06 0.00 0.14 -0.13 0.25 0.22
## i8->h43 i9->h43 i10->h43 i11->h43 i12->h43 i13->h43 i14->h43 i15->h43
## 0.32 0.20 0.02 0.45 -0.04 0.02 0.02 -0.13
## i16->h43 i17->h43 i18->h43 i19->h43 i20->h43 i21->h43 i22->h43 i23->h43
## 0.18 0.13 -0.17 0.10 -0.23 -0.05 0.07 -0.38
## i24->h43 i25->h43 i26->h43 i27->h43 i28->h43 i29->h43 i30->h43
## -0.18 0.06 -0.30 0.12 0.01 -0.15 0.18
## b->h44 i1->h44 i2->h44 i3->h44 i4->h44 i5->h44 i6->h44 i7->h44
## 0.00 -0.11 0.03 -0.07 0.04 0.13 0.11 0.09
## i8->h44 i9->h44 i10->h44 i11->h44 i12->h44 i13->h44 i14->h44 i15->h44
## 0.34 -0.09 -0.07 0.17 -0.05 -0.03 0.10 -0.14
## i16->h44 i17->h44 i18->h44 i19->h44 i20->h44 i21->h44 i22->h44 i23->h44
## -0.07 0.17 -0.20 0.10 -0.09 -0.24 0.01 -0.10
## i24->h44 i25->h44 i26->h44 i27->h44 i28->h44 i29->h44 i30->h44
## -0.36 -0.15 -0.40 -0.07 0.00 0.00 0.12
## b->h45 i1->h45 i2->h45 i3->h45 i4->h45 i5->h45 i6->h45 i7->h45
## -0.06 -0.07 0.05 -0.07 0.09 0.14 0.07 0.07
## i8->h45 i9->h45 i10->h45 i11->h45 i12->h45 i13->h45 i14->h45 i15->h45
## 0.38 -0.12 -0.12 0.22 -0.07 -0.05 0.15 -0.18
## i16->h45 i17->h45 i18->h45 i19->h45 i20->h45 i21->h45 i22->h45 i23->h45
## -0.12 0.21 -0.23 0.06 -0.11 -0.20 0.02 -0.15
## i24->h45 i25->h45 i26->h45 i27->h45 i28->h45 i29->h45 i30->h45
## -0.37 -0.20 -0.45 -0.13 0.06 -0.02 0.10
## b->h46 i1->h46 i2->h46 i3->h46 i4->h46 i5->h46 i6->h46 i7->h46
## -0.02 -0.07 0.04 0.02 -0.10 0.12 0.18 0.15
## i8->h46 i9->h46 i10->h46 i11->h46 i12->h46 i13->h46 i14->h46 i15->h46
## 0.32 -0.16 0.10 0.14 0.06 -0.01 0.07 -0.09
## i16->h46 i17->h46 i18->h46 i19->h46 i20->h46 i21->h46 i22->h46 i23->h46
## -0.08 0.19 -0.13 -0.03 -0.08 -0.27 0.00 -0.05
## i24->h46 i25->h46 i26->h46 i27->h46 i28->h46 i29->h46 i30->h46
## -0.35 -0.29 -0.40 -0.08 -0.05 0.13 0.10
## b->h47 i1->h47 i2->h47 i3->h47 i4->h47 i5->h47 i6->h47 i7->h47
## -0.01 -0.10 0.03 -0.07 0.05 0.14 0.11 0.09
## i8->h47 i9->h47 i10->h47 i11->h47 i12->h47 i13->h47 i14->h47 i15->h47
## 0.35 -0.10 -0.08 0.18 -0.06 -0.04 0.11 -0.15
## i16->h47 i17->h47 i18->h47 i19->h47 i20->h47 i21->h47 i22->h47 i23->h47
## -0.08 0.18 -0.21 0.10 -0.09 -0.23 0.01 -0.10
## i24->h47 i25->h47 i26->h47 i27->h47 i28->h47 i29->h47 i30->h47
## -0.36 -0.17 -0.41 -0.09 0.01 0.00 0.11
## b->h48 i1->h48 i2->h48 i3->h48 i4->h48 i5->h48 i6->h48 i7->h48
## 0.04 0.11 0.08 0.03 0.20 -0.16 0.29 0.29
## i8->h48 i9->h48 i10->h48 i11->h48 i12->h48 i13->h48 i14->h48 i15->h48
## 0.37 0.33 0.01 0.54 -0.12 0.06 -0.03 -0.11
## i16->h48 i17->h48 i18->h48 i19->h48 i20->h48 i21->h48 i22->h48 i23->h48
## 0.25 0.06 -0.19 0.18 -0.29 0.04 0.05 -0.49
## i24->h48 i25->h48 i26->h48 i27->h48 i28->h48 i29->h48 i30->h48
## -0.13 0.13 -0.29 0.23 -0.01 -0.18 0.19
## b->h49 i1->h49 i2->h49 i3->h49 i4->h49 i5->h49 i6->h49 i7->h49
## -0.08 -0.05 0.04 -0.08 0.06 -0.03 0.17 0.13
## i8->h49 i9->h49 i10->h49 i11->h49 i12->h49 i13->h49 i14->h49 i15->h49
## 0.24 -0.01 0.01 0.29 0.05 -0.01 0.11 -0.16
## i16->h49 i17->h49 i18->h49 i19->h49 i20->h49 i21->h49 i22->h49 i23->h49
## 0.04 0.23 -0.13 -0.04 -0.13 -0.15 0.09 -0.21
## i24->h49 i25->h49 i26->h49 i27->h49 i28->h49 i29->h49 i30->h49
## -0.27 -0.09 -0.34 -0.03 0.06 -0.10 0.14
## b->h50 i1->h50 i2->h50 i3->h50 i4->h50 i5->h50 i6->h50 i7->h50
## -0.05 -0.40 -0.62 0.80 0.42 0.41 -1.60 -0.88
## i8->h50 i9->h50 i10->h50 i11->h50 i12->h50 i13->h50 i14->h50 i15->h50
## -0.59 0.20 0.58 -0.25 0.45 -0.64 0.28 -0.20
## i16->h50 i17->h50 i18->h50 i19->h50 i20->h50 i21->h50 i22->h50 i23->h50
## -0.53 -1.60 0.59 0.96 -1.67 -0.54 0.01 -0.35
## i24->h50 i25->h50 i26->h50 i27->h50 i28->h50 i29->h50 i30->h50
## 0.49 -0.04 0.06 -0.56 0.39 -0.35 0.03
## b->o h1->o h2->o h3->o h4->o h5->o h6->o h7->o h8->o h9->o h10->o
## 2.04 0.83 0.50 0.84 0.90 0.79 0.79 -0.22 0.47 0.73 0.47
## h11->o h12->o h13->o h14->o h15->o h16->o h17->o h18->o h19->o h20->o h21->o
## 0.42 0.16 -0.92 -2.04 -0.76 -0.55 -0.36 -0.75 -0.50 -0.18 -0.58
## h22->o h23->o h24->o h25->o h26->o h27->o h28->o h29->o h30->o h31->o h32->o
## -2.34 -2.11 -0.17 -3.57 -1.20 -0.32 -1.45 -1.45 0.46 -0.18 -0.42
## h33->o h34->o h35->o h36->o h37->o h38->o h39->o h40->o h41->o h42->o h43->o
## -1.16 -1.28 -1.40 -0.53 -0.94 0.42 0.87 0.84 0.87 1.49 0.63
## h44->o h45->o h46->o h47->o h48->o h49->o h50->o
## 0.80 0.85 0.90 0.82 0.73 0.57 1.80
Construímos un vector de probabilidades aplicando predict al conjunto de test
nnet_predict <- predict(object = nnet,
newdata = test_nnet,
type = "prob")
nnet_predict <- (nnet_predict)[,2]
head(nnet_predict)
## [1] 0.793380569 0.005590564 0.199922160 0.022146263 0.454357315 0.141001846
Graficamos el modelo generado
plot(nnet_predict~test_nnet$TARGET_Churn)
Con la función umbrales probamos diferentes cortes
umb_nnet<-umbrales(test_nnet$TARGET_Churn,nnet_predict)
umb_nnet
## umbral acierto precision cobertura F1
## 1 0.05 52.18412 34.29131 97.765363 50.773694
## 2 0.10 61.01456 38.82532 94.785847 55.086580
## 3 0.15 66.18131 41.96664 89.013035 57.040573
## 4 0.20 70.73744 45.64777 83.985102 59.147541
## 5 0.25 74.16628 49.26052 80.633147 61.158192
## 6 0.30 76.27994 52.06186 75.232775 61.538462
## 7 0.35 77.92391 54.77889 71.508380 62.035541
## 8 0.40 79.23908 57.57576 67.225326 62.027491
## 9 0.45 79.84969 59.89011 60.893855 60.387812
## 10 0.50 79.99061 61.63522 54.748603 57.988166
## 11 0.55 79.99061 64.41558 46.182495 53.796095
## 12 0.60 80.22546 68.83117 39.478585 50.177515
## 13 0.65 79.66181 70.31250 33.519553 45.397226
## 14 0.70 78.76938 73.22404 24.953445 37.222222
## 15 0.75 78.15876 77.69231 18.808194 30.284858
## 16 0.80 77.45420 81.31868 13.780261 23.566879
## 17 0.85 76.84359 86.66667 9.683426 17.420436
## 18 0.90 75.38751 93.33333 2.607076 5.072464
## 19 0.95 0.95000 0.95000 0.950000 0.950000
Seleccionamos automáticamente el mejor umbral
umbral_final_nnet<-umb_nnet[which.max(umb_nnet$F1),1]
umbral_final_nnet
## [1] 0.35
Evaluamos la matriz de confusión y las métricas con el umbral optimizado
confusion(test_nnet$TARGET_Churn,nnet_predict,umbral_final_nnet)
##
## real FALSE TRUE
## No 1275 317
## Yes 153 384
nnet_metricas<-filter(umb_nnet,umbral==umbral_final_nnet)
nnet_metricas
## umbral acierto precision cobertura F1
## 1 0.35 77.92391 54.77889 71.50838 62.03554
Evaluamos la ROC
#creamos el objeto prediction
nnet_prediction<-prediction(nnet_predict,test_nnet$TARGET_Churn)
#visualizamos la ROC
roc(nnet_prediction)
Sacamos las métricas definitivas incluyendo el AUC
nnet_metricas<-cbind(nnet_metricas,AUC=round(auc(nnet_prediction),2)*100)
print(t(nnet_metricas))
## [,1]
## umbral 0.35000
## acierto 77.92391
## precision 54.77889
## cobertura 71.50838
## F1 62.03554
## AUC 84.00000
4.3.9 - Comparamos los 6 métodos
comparativa <- rbind(rl_metricas,ar_metricas,rf_metricas,svm_metricas,gbm_metricas,nnet_metricas)
rownames(comparativa) <- c('Regresion Logistica','Arbol Decision','Random Forest', 'SVM', 'Gradient Boosting','Red Neuronal')
t(comparativa) #t simplemente transpone para leerlo mejor
## Regresion Logistica Arbol Decision Random Forest SVM
## umbral 0.40000 0.30000 0.30000 0.30000
## acierto 77.12541 78.53452 78.95726 78.20573
## precision 53.56125 56.45161 57.23577 55.37555
## cobertura 70.01862 65.17691 65.54935 70.01862
## F1 60.69411 60.50130 61.11111 61.84211
## AUC 82.00000 81.00000 82.00000 83.00000
## Gradient Boosting Red Neuronal
## umbral 0.40000 0.35000
## acierto 79.56787 77.92391
## precision 58.25243 54.77889
## cobertura 67.03911 71.50838
## F1 62.33766 62.03554
## AUC 85.00000 84.00000
Conclusión:
El modelo con la métrica “AUC” mayor es el Gradient Boosting.
Podríamos intentar mejorar la calidad del modelo realizando más ajustes de hiperparámetros, incluso con técinas de ensamblado.
4.3.10 - Escribimos el scoring final en el dataset y guardamos el modelo
df$SCORING_CHURN <- predict(gbm,df_gbm,type = 'prob')[,2]
saveRDS(rl,'03_modelo_final_V2.rds')
saveRDS(df,'cacheV4.rds')
Vamos a recuperar el dataset anterior a las discretizaciones
df <- readRDS(file = 'cacheV1.rds')
6.1 - Calculamos la tasa de abandono actual
Previamente visualizamos el número de clientes por clase de la target
df %>% ggplot(aes(TARGET_Churn, fill=TARGET_Churn))+
geom_bar() +
geom_text(stat='count', aes(label=stat(count))) +
labs(title='Desglose del número de clientes en TARGET_Churn') + theme(plot.title = element_text(size = 10))
Cálculo de la tasa churn
df %>%
summarise(tasa_churn = round(sum(TARGET_Churn == 1) / nrow(df) * 100, 2))
## tasa_churn
## 1 26.58
Esta tasa arroja un valor actualmente del 26,58%, totalmente inasumible desde el punto de vista de negocio. Conquistar a un nuevo cliente cuesta de 5 a 7 veces más que mantener a uno antiguo.
Tasa churn de las operadoras móvil en España en Mayo 2018
6.2 - Cálculo del tiempo de vida medio
tvm <- round(1 / (sum(df$TARGET_Churn == 1) / nrow(df)), 2)
tvm
## [1] 3.76
Si no se produjesen nuevas altas, la cartera de esta telco desaparecería en 3,76 años
6.3 - Cálculo del ticket medio mensual
df %>% group_by(TARGET_Churn) %>%
summarise(ticket_medio = mean(MonthlyCharges))
## # A tibble: 2 x 2
## TARGET_Churn ticket_medio
## <fct> <dbl>
## 1 0 61.3
## 2 1 74.4
tick_med_churn <- df %>% filter(TARGET_Churn==1) %>%
summarise(ticket_medio = round(mean(MonthlyCharges),2))
tick_med_actual <- df %>% filter(TARGET_Churn==0) %>%
summarise(ticket_medio = round(mean(MonthlyCharges),2))
El problema se agrava, el ticket medio de los clientes que abandonan, es mayor que el de los que permanecen
6.4 - Valor anualizado de la venta anual de los clientes perdidos
df %>% filter(TARGET_Churn==1) %>%
summarise(val_vta_anual = sum(MonthlyCharges) * 12)
## val_vta_anual
## 1 1669570
Valor anualizado de las ventas totales (perdidos + retenidos)
df %>%
group_by(TARGET_Churn) %>%
summarise(val_vta_anual = sum(MonthlyCharges) * 12) %>%
mutate(porc=(val_vta_anual / sum(val_vta_anual)*100))
## # A tibble: 2 x 3
## TARGET_Churn val_vta_anual porc
## <fct> <dbl> <dbl>
## 1 0 3798362. 69.5
## 2 1 1669570. 30.5
El porcentaje de ventas perdidas asciende a un 30,5%
Vamos a ver en qué segmento de antigüedad es mayor el valor de la pérdida
df %>% filter(TARGET_Churn==1) %>%
mutate(tenure_disc=case_when(tenure <=12 ~ '01_1Año',
tenure > 12 & tenure<=24 ~ '02_2Años',
tenure > 24 & tenure<=36 ~ '03_3Años',
tenure > 36 & tenure<=48 ~ '04_4Años',
tenure > 48 & tenure<=60 ~ '05_5Años',
tenure > 60 & tenure<=72 ~ '06_6Años',
TRUE ~ '07_ERROR'
)) %>%
group_by(tenure_disc) %>%
summarise(val_vta_anual = sum(MonthlyCharges) * 12) %>%
mutate(porc=(val_vta_anual / sum(val_vta_anual)*100))
## # A tibble: 6 x 3
## tenure_disc val_vta_anual porc
## <chr> <dbl> <dbl>
## 1 01_1Año 827451 49.6
## 2 02_2Años 276980. 16.6
## 3 03_3Años 182015. 10.9
## 4 04_4Años 147535. 8.84
## 5 05_5Años 126983. 7.61
## 6 06_6Años 108607. 6.51
Constatamos que el 49,6% de la pérdida anual de venta es de clientes recientes, permanencia menos de un año
Vamos a ver en ese mismo segmento, cómo se distribyue la masa de clientes perdidos
df %>% filter(TARGET_Churn==1) %>%
mutate(tenure_disc=case_when(tenure <=12 ~ '01_1Año',
tenure > 12 & tenure<=24 ~ '02_2Años',
tenure > 24 & tenure<=36 ~ '03_3Años',
tenure > 36 & tenure<=48 ~ '04_4Años',
tenure > 48 & tenure<=60 ~ '05_5Años',
tenure > 60 & tenure<=72 ~ '06_6Años',
TRUE ~ '07_ERROR'
)) %>%
group_by(tenure_disc) %>%
summarise(cli_perd = n()) %>%
mutate(porc=(cli_perd / sum(cli_perd)*100))
## # A tibble: 6 x 3
## tenure_disc cli_perd porc
## <chr> <int> <dbl>
## 1 01_1Año 1037 55.5
## 2 02_2Años 294 15.7
## 3 03_3Años 180 9.63
## 4 04_4Años 145 7.76
## 5 05_5Años 120 6.42
## 6 06_6Años 93 4.98
num_cli_churn <- df %>% filter(TARGET_Churn==1) %>%
summarise(cli_perd = n())
Y en numero de clientes, el porcentaje de clientes recientes perdidos asciende al 55,5%.
Desde el punto de vista de negocio, habría que analizar estos resultados, averiguar por qué la mayor tasa de abandono se produce en clientes con poca antigüedad, si es debido a que no se cumplen sus expectativas, o si abandonan después de un periodo de prueba.
6.5 - Cálculo del tiempo medio de permanencia
df %>% group_by(TARGET_Churn) %>%
summarise(tiempo_medio_perm = mean(tenure))
## # A tibble: 2 x 2
## TARGET_Churn tiempo_medio_perm
## <fct> <dbl>
## 1 0 37.7
## 2 1 18.0
df %>% group_by(TARGET_Churn) %>%
summarise(tiempo_medio_perm = mean(tenure)) %>%
ggplot(aes(TARGET_Churn, tiempo_medio_perm, fill = TARGET_Churn)) +
geom_col() +
geom_text(aes(label = round(tiempo_medio_perm, 2)), vjust = -0.5, size=3) +
labs(title='Tiempo medio de permanencia por clase en TARGET_Churn') + theme(plot.title = element_text(size = 10))
Vemos claramente como el tiempo medio de permanencia entre los clientes retenidos dobla al de los clientes que abandonaron
6.6 - Análisis de la probabilidad de abandono - SCORINGS
Recuperamos el data frame con los SCORINGS
df <- readRDS(file = 'cacheV4.rds')
Vamos a visualizar el abandono real por tramos de scoring. Este gráfico es muy potente para ver que el modelo es consistente, ya que debe presentar una linea descendente en la tasa de abandono conforme se desciende en el scoring
#Creamos una función para visualizar el abandono real por percentiles de scoring
vis <- function(scoring,real) {
#Preparar el dataframe de visualización
vis_df <- data.frame(Scoring = scoring, Perc_Scoring = cut_number(scoring, 20), Real = real)
levels(vis_df$Perc_Scoring) <- seq(from = 100,to = 5,by = -5)
vis_gr <- vis_df %>% group_by(Perc_Scoring) %>% summarise(Tasa_Contr = mean(as.numeric(as.character(Real)))) %>% arrange(Perc_Scoring)
#ordenar el factor para el gráfico
vis_gr$Perc_Scoring <- factor(vis_gr$Perc_Scoring, levels = vis_gr$Perc_Scoring[order(vis_gr$Perc_Scoring, decreasing = T)])
#hacemos el gráfico
ggplot(vis_gr,aes(Perc_Scoring, Tasa_Contr)) +
geom_col(fill='grey') +
geom_hline(aes(yintercept = mean(as.numeric(as.character(vis_df$Real)))), col = 'black') +
labs(title = 'Abandono real por tramo de scoring', x = 'Tramo de Scoring', y = 'Abandono real')
}
vis(df$SCORING_CHURN,df$TARGET_Churn)
Constatamos una caída gradual a medida que cae el scoring de los clientes, lo que buscábamos.
Visualizamos los 15 clientes de mayor scoring, de entre los que no abandonaron. Estos serían los clientes activos con mayor probabilidad de causar baja
cliente_vivo_riesgo <- df %>% filter(TARGET_Churn==0) %>%
select(customerID, SCORING_CHURN) %>%
top_n(15, wt=SCORING_CHURN)
cliente_vivo_riesgo
## customerID SCORING_CHURN
## 1: 0021-IKXGC 0.8941015
## 2: 1452-VOQCH 0.8526383
## 3: 6630-UJZMY 0.8495921
## 4: 7439-DKZTW 0.8454919
## 5: 8775-ERLNB 0.8496881
## 6: 1393-IMKZG 0.8367554
## 7: 9603-OAIHC 0.8539592
## 8: 5150-ITWWB 0.8577245
## 9: 2545-EBUPK 0.8368027
## 10: 3878-AVSOQ 0.8630476
## 11: 7577-SWIFR 0.8747460
## 12: 1941-HOSAM 0.8650702
## 13: 4273-MBHYA 0.8616340
## 14: 4912-PIGUY 0.8876554
## 15: 9605-WGJVW 0.8630476
Vamos a segmentar a los clientes actuales por tramos de riesgo.
Creamos los tramos
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
group_by(tramo_riesgo) %>%
summarise(media_sco = mean(SCORING_CHURN))
## # A tibble: 4 x 2
## tramo_riesgo media_sco
## <fct> <dbl>
## 1 sin_riesgo 0.0593
## 2 riesgo_bajo 0.363
## 3 riesgo_medio 0.671
## 4 alto_riesgo 0.842
Comprobamos que el scoring medio por segmento aumenta a medida que aumenta el tramo de riesgo
Visualizamos la media de scoring a través de los tramos de riesgo
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
group_by(tramo_riesgo) %>%
summarise(media_sco = mean(SCORING_CHURN)) %>%
ggplot(aes(tramo_riesgo, media_sco)) +
geom_col() +
geom_text(aes(label = round(media_sco, 2)), vjust = -0.5)
Calculamos el número de clientes a través de los tramos de riesgo
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
group_by(tramo_riesgo) %>%
summarise(num_cli = n())
## # A tibble: 4 x 2
## tramo_riesgo num_cli
## <fct> <int>
## 1 sin_riesgo 3317
## 2 riesgo_bajo 1578
## 3 riesgo_medio 244
## 4 alto_riesgo 24
Visualizamos el número de clientes por tramos de riesgo
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
group_by(tramo_riesgo) %>%
summarise(num_cli = n()) %>%
ggplot(aes(tramo_riesgo, num_cli)) +
geom_col() +
geom_text(aes(label = round(num_cli, 3)), vjust = -0.5)
6.7 - Diseño de estrategias de retención
Clientes con ALTO RIESGO de abandono
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
filter(TARGET_Churn == 0 & tramo_riesgo=="alto_riesgo") %>%
select(customerID, tramo_riesgo, SCORING_CHURN) %>%
arrange(desc(SCORING_CHURN)) %>%
head(20)
## customerID tramo_riesgo SCORING_CHURN
## 1: 0021-IKXGC alto_riesgo 0.8941015
## 2: 4912-PIGUY alto_riesgo 0.8876554
## 3: 7577-SWIFR alto_riesgo 0.8747460
## 4: 1941-HOSAM alto_riesgo 0.8650702
## 5: 3878-AVSOQ alto_riesgo 0.8630476
## 6: 9605-WGJVW alto_riesgo 0.8630476
## 7: 4273-MBHYA alto_riesgo 0.8616340
## 8: 5150-ITWWB alto_riesgo 0.8577245
## 9: 9603-OAIHC alto_riesgo 0.8539592
## 10: 1452-VOQCH alto_riesgo 0.8526383
## 11: 8775-ERLNB alto_riesgo 0.8496881
## 12: 6630-UJZMY alto_riesgo 0.8495921
## 13: 7439-DKZTW alto_riesgo 0.8454919
## 14: 2545-EBUPK alto_riesgo 0.8368027
## 15: 1393-IMKZG alto_riesgo 0.8367554
## 16: 2018-QKYGT alto_riesgo 0.8349782
## 17: 1640-PLFMP alto_riesgo 0.8231423
## 18: 1628-BIZYP alto_riesgo 0.8195199
## 19: 5542-TBBWB alto_riesgo 0.8119826
## 20: 7465-ZZRVX alto_riesgo 0.8119826
ESTRATEGIA:
1. Diseñar un plan de acción inmediata
2. Elaborar un informe detallado a gerencia
3. Definir un plan personalizado de visitas comercial u otra acción similar
Clientes con RIESGO MEDIO de abandono
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
filter(TARGET_Churn == 0 & tramo_riesgo=="riesgo_medio") %>%
select(customerID, tramo_riesgo, SCORING_CHURN) %>%
arrange(desc(SCORING_CHURN)) %>%
head(20)
## customerID tramo_riesgo SCORING_CHURN
## 1: 8309-IEYJD riesgo_medio 0.7955180
## 2: 8739-XNIKG riesgo_medio 0.7899105
## 3: 3320-VEOYC riesgo_medio 0.7881429
## 4: 0916-QOFDP riesgo_medio 0.7863516
## 5: 7365-BVCJH riesgo_medio 0.7803026
## 6: 4927-WWOOZ riesgo_medio 0.7772956
## 7: 7398-SKNQZ riesgo_medio 0.7759054
## 8: 2799-ARNLO riesgo_medio 0.7689117
## 9: 9799-CAYJJ riesgo_medio 0.7689117
## 10: 3318-NMQXL riesgo_medio 0.7686961
## 11: 0248-IPDFW riesgo_medio 0.7622616
## 12: 2254-DLXRI riesgo_medio 0.7602768
## 13: 8242-SOQUO riesgo_medio 0.7578951
## 14: 3841-CONLJ riesgo_medio 0.7570932
## 15: 7590-VHVEG riesgo_medio 0.7557346
## 16: 7044-YAACC riesgo_medio 0.7553471
## 17: 0187-QSXOE riesgo_medio 0.7549581
## 18: 9822-OAOVB riesgo_medio 0.7533455
## 19: 4929-XIHVW riesgo_medio 0.7520621
## 20: 0674-EYYZV riesgo_medio 0.7520071
ESTRATEGIA:
1. Diseñar un plan de acción a medio plazo
2. Monitorización de los clientes
Clientes con RIESGO BAJO de abandono
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
filter(TARGET_Churn == 0 & tramo_riesgo=="riesgo_bajo") %>%
select(customerID, tramo_riesgo, SCORING_CHURN) %>%
arrange(desc(SCORING_CHURN)) %>%
head(20)
## customerID tramo_riesgo SCORING_CHURN
## 1: 2985-FMWYF riesgo_bajo 0.5999150
## 2: 4583-PARNH riesgo_bajo 0.5988846
## 3: 3496-LFSZU riesgo_bajo 0.5972395
## 4: 1608-GMEWB riesgo_bajo 0.5970879
## 5: 2275-RBYQS riesgo_bajo 0.5963537
## 6: 9214-EKVXR riesgo_bajo 0.5963537
## 7: 5857-TYBCJ riesgo_bajo 0.5962431
## 8: 5271-DBYSJ riesgo_bajo 0.5962187
## 9: 5019-GQVCR riesgo_bajo 0.5961399
## 10: 3234-VKACU riesgo_bajo 0.5954418
## 11: 1379-FRVEB riesgo_bajo 0.5947246
## 12: 6267-DCFFZ riesgo_bajo 0.5925335
## 13: 1754-GKYPY riesgo_bajo 0.5922341
## 14: 7284-BUYEC riesgo_bajo 0.5911408
## 15: 1942-OQFRW riesgo_bajo 0.5908475
## 16: 0988-JRWWP riesgo_bajo 0.5906659
## 17: 5442-BHQNG riesgo_bajo 0.5902136
## 18: 5989-PGKJB riesgo_bajo 0.5896601
## 19: 7517-LDMPS riesgo_bajo 0.5896601
## 20: 1866-OBPNR riesgo_bajo 0.5886855
ESTRATEGIA:
1. Diseñar un plan de acción a largo plazo
2. Seguimiento de los clientes
Clientes SIN RIESGO de abandono
df %>%
filter(TARGET_Churn==0) %>%
mutate(tramo_riesgo = cut(SCORING_CHURN, breaks = c(0, 0.2, 0.6, 0.8, 1), labels = c("sin_riesgo", "riesgo_bajo", "riesgo_medio", "alto_riesgo"))) %>%
filter(TARGET_Churn == 0 & tramo_riesgo=="sin_riesgo") %>% select(customerID, tramo_riesgo, SCORING_CHURN) %>%
arrange(desc(SCORING_CHURN)) %>%
head(20)
## customerID tramo_riesgo SCORING_CHURN
## 1: 7103-IPXPJ sin_riesgo 0.1999091
## 2: 4826-TZEVA sin_riesgo 0.1999012
## 3: 0723-VSOBE sin_riesgo 0.1997020
## 4: 1530-ZTDOZ sin_riesgo 0.1995553
## 5: 5325-UWTWJ sin_riesgo 0.1990856
## 6: 0956-ACVZC sin_riesgo 0.1990284
## 7: 8250-ZNGGW sin_riesgo 0.1989830
## 8: 7824-PANSQ sin_riesgo 0.1988941
## 9: 2305-MRGLV sin_riesgo 0.1988747
## 10: 0959-WHOKV sin_riesgo 0.1986284
## 11: 4747-LCAQL sin_riesgo 0.1985534
## 12: 2674-MLXMN sin_riesgo 0.1985047
## 13: 1629-DQQVB sin_riesgo 0.1984607
## 14: 0961-ZWLVI sin_riesgo 0.1984509
## 15: 7789-CRUVC sin_riesgo 0.1984477
## 16: 1710-RCXUS sin_riesgo 0.1981808
## 17: 7319-VENRZ sin_riesgo 0.1980338
## 18: 1518-OMDIK sin_riesgo 0.1979941
## 19: 3694-GLTJM sin_riesgo 0.1979846
## 20: 5712-VBOXD sin_riesgo 0.1979846
6.8 - Otras estrategias
¿Como decidimos los clientes a los que nos dirigiremos para evitar el abandono?
Opción 1: El tamaño de campaña viene definido por un criterio de negocio como por ejemplo el presupuesto total asignado a la acción
#Supongamos que tenemos un presupuesto de 15.000€
#Y que la campaña se realizara por call center, con un coste unitario de 20€ por cliente contactado
#Entonces el numero de clientes a contactar sera de 15.000 / 20 = 750
#Para extraerlos simplemente filtramos del total de la base de clientes a aquellos que ya abandonaron y después cogemos los 750 primeros ordenados por scoring
presupuesto <- 15000
coste_unit <- 20
tamaño_campaña <- presupuesto / coste_unit
lim_sup_sin_riesgo <- 0.2
lim_sup_riesgo_bajo <- 0.6
lim_sup_riesgo_medio <- 0.8
bote_campaña <- df %>%
filter(TARGET_Churn==0) %>%
arrange(desc(SCORING_CHURN)) %>%
slice(1:tamaño_campaña) %>%
select(customerID,SCORING_CHURN)
#Previsualizamos la salida
head(bote_campaña,50)
## customerID SCORING_CHURN
## 1: 0021-IKXGC 0.8941015
## 2: 4912-PIGUY 0.8876554
## 3: 7577-SWIFR 0.8747460
## 4: 1941-HOSAM 0.8650702
## 5: 3878-AVSOQ 0.8630476
## 6: 9605-WGJVW 0.8630476
## 7: 4273-MBHYA 0.8616340
## 8: 5150-ITWWB 0.8577245
## 9: 9603-OAIHC 0.8539592
## 10: 1452-VOQCH 0.8526383
## 11: 8775-ERLNB 0.8496881
## 12: 6630-UJZMY 0.8495921
## 13: 7439-DKZTW 0.8454919
## 14: 2545-EBUPK 0.8368027
## 15: 1393-IMKZG 0.8367554
## 16: 2018-QKYGT 0.8349782
## 17: 1640-PLFMP 0.8231423
## 18: 1628-BIZYP 0.8195199
## 19: 5542-TBBWB 0.8119826
## 20: 7465-ZZRVX 0.8119826
## 21: 0817-HSUSE 0.8109055
## 22: 3489-HHPFY 0.8066445
## 23: 6350-XFYGW 0.8022597
## 24: 1197-BVMVG 0.8007057
## 25: 8309-IEYJD 0.7955180
## 26: 8739-XNIKG 0.7899105
## 27: 3320-VEOYC 0.7881429
## 28: 0916-QOFDP 0.7863516
## 29: 7365-BVCJH 0.7803026
## 30: 4927-WWOOZ 0.7772956
## 31: 7398-SKNQZ 0.7759054
## 32: 2799-ARNLO 0.7689117
## 33: 9799-CAYJJ 0.7689117
## 34: 3318-NMQXL 0.7686961
## 35: 0248-IPDFW 0.7622616
## 36: 2254-DLXRI 0.7602768
## 37: 8242-SOQUO 0.7578951
## 38: 3841-CONLJ 0.7570932
## 39: 7590-VHVEG 0.7557346
## 40: 7044-YAACC 0.7553471
## 41: 0187-QSXOE 0.7549581
## 42: 9822-OAOVB 0.7533455
## 43: 4929-XIHVW 0.7520621
## 44: 0674-EYYZV 0.7520071
## 45: 2262-SLNVK 0.7516582
## 46: 4847-TAJYI 0.7495297
## 47: 4115-NZRKS 0.7495297
## 48: 9402-ORRAH 0.7495297
## 49: 2740-TVLFN 0.7484343
## 50: 4090-KPJIP 0.7479404
## customerID SCORING_CHURN
#Vamos a ver gráficamente si de esta forma estamos aprovechando el potencial de nuestro modelo
tasa_churn <- mean(as.numeric(as.character(df$TARGET_Churn)))
df %>%
filter(TARGET_Churn==0) %>%
arrange(desc(SCORING_CHURN)) %>%
ggplot(aes(y = SCORING_CHURN, x = seq_along(SCORING_CHURN))) +
geom_line() +
geom_vline(xintercept = tamaño_campaña, col = 'orange') +
geom_text(x=750, y=tasa_churn, label="tasa_churn", hjust="left", vjust=1, size=3) +
geom_hline(yintercept = tasa_churn,col='blue') +
geom_text(x=750, y=lim_sup_sin_riesgo, label="sin_riesgo", hjust="left", vjust=1, size=3) +
geom_hline(yintercept = lim_sup_sin_riesgo,col='green') +
geom_text(x=750, y=lim_sup_riesgo_bajo, label="riesgo_bajo", hjust="left", vjust=1,size=3) +
geom_hline(yintercept = lim_sup_riesgo_bajo,col='yellow') +
geom_text(x=750, y=lim_sup_riesgo_medio, label="riesgo_medio", hjust="left", vjust=1,size=3) +
geom_hline(yintercept = lim_sup_riesgo_medio,col='red') +
labs(x = 'CLIENTES ORDENADOS POR SCORING', y = 'SCORING')
Ejecutando esta opción, la acción alcanzaría a todos los clientes con scoring en tramo alto riesgo y riesgo medio, y a una parte importante de los catalogados como riesgo bajo. No estaríamos optimizando parte de los recursos destinados al presupuesto
Opción 2: Establecemos un criterio de negocio. Seleccionamos aquellos clientes que tengan un scoring de hasta un valor de la tasa churn actual multiplicada x2, y x3.
scoring_x2 <- tasa_churn * 2
scoring_x3 <- tasa_churn * 3
bote_campaña_x2 <- df %>%
filter(TARGET_Churn==0 & SCORING_CHURN > scoring_x2) %>%
select(customerID ,SCORING_CHURN)
#Tamaño del bote_campaña_x2
nrow(bote_campaña_x2)
## [1] 429
bote_campaña_x3 <- df %>%
filter(TARGET_Churn==0 & SCORING_CHURN > scoring_x3) %>%
select(customerID ,SCORING_CHURN)
#Tamaño del bote_campaña_x3
nrow(bote_campaña_x3)
## [1] 24
#Vamos a ver gráficamente si de esta forma estamos aprovechando el potencial de nuestro modelo
df %>%
filter(TARGET_Churn==0) %>%
arrange(desc(SCORING_CHURN)) %>%
ggplot(aes(y = SCORING_CHURN, x = seq_along(SCORING_CHURN))) +
geom_line() +
geom_hline(yintercept = scoring_x2,col='blue') +
geom_text(x=24, y=scoring_x3, label="tasa_churn_x3", hjust="left", vjust=1,size=3) +
geom_hline(yintercept = scoring_x3,col='orange') +
geom_text(x=500, y=scoring_x2, label="tasa_churn_x2", hjust="left", vjust=1,size=3) +
geom_hline(yintercept = tasa_churn,col='black') +
geom_text(x=1600, y=tasa_churn, label="tasa_churn", hjust="left", vjust=1,size=3) +
labs(x = 'CLIENTES ORDENADOS POR SCORING', y = 'SCORING')
Opcion 3: Calculamos el tamaño máximo de campaña que resultaría rentable, teniendo en cuenta el margen medio previsto por retención del cliente y el coste medio por accion comercial
#Supuesto un margen neto de un 10%, y utilizando el ticket medio mensual de los clientes retenidos
#Supuesto de coste medio por acción comercial (call center cliente contactado) = 20€
#Definición de margen esperado = probabilidad de evento * margen evento
#Definición de margen neto = margen esperado - coste medio
marg_medio_ret_anualizado <- (61.31 * 0.1 * 12)
coste_medio <- 20
#Calculamos el margen esperado de cada cliente
df_campaña <- df %>%
filter(TARGET_Churn==0) %>%
mutate(
ME = SCORING_CHURN * marg_medio_ret_anualizado,
MN = ME - coste_medio) %>%
arrange(desc(MN)) %>%
mutate(INDICE = 1:nrow(.)) %>%
select(customerID ,INDICE,ME,MN)
head(df_campaña,50)
## customerID INDICE ME MN
## 1: 0021-IKXGC 1 65.78083 45.78083
## 2: 4912-PIGUY 2 65.30658 45.30658
## 3: 7577-SWIFR 3 64.35681 44.35681
## 4: 1941-HOSAM 4 63.64494 43.64494
## 5: 3878-AVSOQ 5 63.49614 43.49614
## 6: 9605-WGJVW 6 63.49614 43.49614
## 7: 4273-MBHYA 7 63.39214 43.39214
## 8: 5150-ITWWB 8 63.10450 43.10450
## 9: 9603-OAIHC 9 62.82749 42.82749
## 10: 1452-VOQCH 10 62.73031 42.73031
## 11: 8775-ERLNB 11 62.51325 42.51325
## 12: 6630-UJZMY 12 62.50619 42.50619
## 13: 7439-DKZTW 13 62.20453 42.20453
## 14: 2545-EBUPK 14 61.56525 41.56525
## 15: 1393-IMKZG 15 61.56177 41.56177
## 16: 2018-QKYGT 16 61.43101 41.43101
## 17: 1640-PLFMP 17 60.56023 40.56023
## 18: 1628-BIZYP 18 60.29372 40.29372
## 19: 5542-TBBWB 19 59.73918 39.73918
## 20: 7465-ZZRVX 20 59.73918 39.73918
## 21: 0817-HSUSE 21 59.65994 39.65994
## 22: 3489-HHPFY 22 59.34645 39.34645
## 23: 6350-XFYGW 23 59.02385 39.02385
## 24: 1197-BVMVG 24 58.90952 38.90952
## 25: 8309-IEYJD 25 58.52785 38.52785
## 26: 8739-XNIKG 26 58.11529 38.11529
## 27: 3320-VEOYC 27 57.98525 37.98525
## 28: 0916-QOFDP 28 57.85346 37.85346
## 29: 7365-BVCJH 29 57.40842 37.40842
## 30: 4927-WWOOZ 30 57.18719 37.18719
## 31: 7398-SKNQZ 31 57.08491 37.08491
## 32: 2799-ARNLO 32 56.57037 36.57037
## 33: 9799-CAYJJ 33 56.57037 36.57037
## 34: 3318-NMQXL 34 56.55451 36.55451
## 35: 0248-IPDFW 35 56.08111 36.08111
## 36: 2254-DLXRI 36 55.93509 35.93509
## 37: 8242-SOQUO 37 55.75986 35.75986
## 38: 3841-CONLJ 38 55.70086 35.70086
## 39: 7590-VHVEG 39 55.60091 35.60091
## 40: 7044-YAACC 40 55.57240 35.57240
## 41: 0187-QSXOE 41 55.54378 35.54378
## 42: 9822-OAOVB 42 55.42513 35.42513
## 43: 4929-XIHVW 43 55.33071 35.33071
## 44: 0674-EYYZV 44 55.32667 35.32667
## 45: 2262-SLNVK 45 55.30099 35.30099
## 46: 4847-TAJYI 46 55.14440 35.14440
## 47: 4115-NZRKS 47 55.14440 35.14440
## 48: 9402-ORRAH 48 55.14440 35.14440
## 49: 2740-TVLFN 49 55.06381 35.06381
## 50: 4090-KPJIP 50 55.02747 35.02747
## customerID INDICE ME MN
Visualizamos las curvas
#Localizamos el punto en el que el margen neto pasa a ser cero o menos
MN_cero <- df_campaña %>% filter(MN <= 0 ) %>% slice(1) %>% select(INDICE)
MN_cero <- MN_cero$INDICE
#Hacemos el gráfico
ggplot(df_campaña,aes(x = INDICE)) +
geom_line(aes(y = ME, col = "ME")) +
geom_line(aes(y = MN, col = "MN")) +
geom_hline(aes(yintercept = coste_medio, col = 'COSTE MEDIO')) +
geom_vline(aes(xintercept = MN_cero, col = 'MARGEN NETO CERO')) +
labs(x = 'TAMAÑO DE CAMPAÑA', y = 'MARGEN', colour = 'KPI')
print(paste('Tamaño maximo de campaña rentable: ',MN_cero))
## [1] "Tamaño maximo de campaña rentable: 1427"
Opción 4: Calculamos el punto optimo de retorno de la inversión
Vamos a calcular 2 nuevas variables que sean un agregado de los ingresos agregados y de los gastos agregados en cada potencial tamaño de campaña, y el ROI como diferencia de las anteriores, y vamos a localizar el tamaño de la campaña que va a maximizar ese ROI y también cuanto vamos a ganar previsiblemente
#vamos a usar la función cumsum(), que hace una suma acumulada secuencial de la variable que pasamos como parámetro
df_campaña <- df_campaña %>%
mutate(
INGRESOS_AGRE = cumsum(ME),
COSTES_AGRE = INDICE * coste_medio,
ROI = INGRESOS_AGRE - COSTES_AGRE)
head(df_campaña,50)
## customerID INDICE ME MN INGRESOS_AGRE COSTES_AGRE ROI
## 1: 0021-IKXGC 1 65.78083 45.78083 65.78083 20 45.78083
## 2: 4912-PIGUY 2 65.30658 45.30658 131.08742 40 91.08742
## 3: 7577-SWIFR 3 64.35681 44.35681 195.44423 60 135.44423
## 4: 1941-HOSAM 4 63.64494 43.64494 259.08917 80 179.08917
## 5: 3878-AVSOQ 5 63.49614 43.49614 322.58531 100 222.58531
## 6: 9605-WGJVW 6 63.49614 43.49614 386.08145 120 266.08145
## 7: 4273-MBHYA 7 63.39214 43.39214 449.47359 140 309.47359
## 8: 5150-ITWWB 8 63.10450 43.10450 512.57809 160 352.57809
## 9: 9603-OAIHC 9 62.82749 42.82749 575.40558 180 395.40558
## 10: 1452-VOQCH 10 62.73031 42.73031 638.13589 200 438.13589
## 11: 8775-ERLNB 11 62.51325 42.51325 700.64914 220 480.64914
## 12: 6630-UJZMY 12 62.50619 42.50619 763.15533 240 523.15533
## 13: 7439-DKZTW 13 62.20453 42.20453 825.35986 260 565.35986
## 14: 2545-EBUPK 14 61.56525 41.56525 886.92511 280 606.92511
## 15: 1393-IMKZG 15 61.56177 41.56177 948.48688 300 648.48688
## 16: 2018-QKYGT 16 61.43101 41.43101 1009.91790 320 689.91790
## 17: 1640-PLFMP 17 60.56023 40.56023 1070.47812 340 730.47812
## 18: 1628-BIZYP 18 60.29372 40.29372 1130.77184 360 770.77184
## 19: 5542-TBBWB 19 59.73918 39.73918 1190.51102 380 810.51102
## 20: 7465-ZZRVX 20 59.73918 39.73918 1250.25021 400 850.25021
## 21: 0817-HSUSE 21 59.65994 39.65994 1309.91015 420 889.91015
## 22: 3489-HHPFY 22 59.34645 39.34645 1369.25660 440 929.25660
## 23: 6350-XFYGW 23 59.02385 39.02385 1428.28045 460 968.28045
## 24: 1197-BVMVG 24 58.90952 38.90952 1487.18997 480 1007.18997
## 25: 8309-IEYJD 25 58.52785 38.52785 1545.71782 500 1045.71782
## 26: 8739-XNIKG 26 58.11529 38.11529 1603.83311 520 1083.83311
## 27: 3320-VEOYC 27 57.98525 37.98525 1661.81836 540 1121.81836
## 28: 0916-QOFDP 28 57.85346 37.85346 1719.67182 560 1159.67182
## 29: 7365-BVCJH 29 57.40842 37.40842 1777.08024 580 1197.08024
## 30: 4927-WWOOZ 30 57.18719 37.18719 1834.26743 600 1234.26743
## 31: 7398-SKNQZ 31 57.08491 37.08491 1891.35234 620 1271.35234
## 32: 2799-ARNLO 32 56.57037 36.57037 1947.92271 640 1307.92271
## 33: 9799-CAYJJ 33 56.57037 36.57037 2004.49309 660 1344.49309
## 34: 3318-NMQXL 34 56.55451 36.55451 2061.04760 680 1381.04760
## 35: 0248-IPDFW 35 56.08111 36.08111 2117.12870 700 1417.12870
## 36: 2254-DLXRI 36 55.93509 35.93509 2173.06379 720 1453.06379
## 37: 8242-SOQUO 37 55.75986 35.75986 2228.82365 740 1488.82365
## 38: 3841-CONLJ 38 55.70086 35.70086 2284.52451 760 1524.52451
## 39: 7590-VHVEG 39 55.60091 35.60091 2340.12542 780 1560.12542
## 40: 7044-YAACC 40 55.57240 35.57240 2395.69782 800 1595.69782
## 41: 0187-QSXOE 41 55.54378 35.54378 2451.24159 820 1631.24159
## 42: 9822-OAOVB 42 55.42513 35.42513 2506.66673 840 1666.66673
## 43: 4929-XIHVW 43 55.33071 35.33071 2561.99744 860 1701.99744
## 44: 0674-EYYZV 44 55.32667 35.32667 2617.32411 880 1737.32411
## 45: 2262-SLNVK 45 55.30099 35.30099 2672.62510 900 1772.62510
## 46: 4847-TAJYI 46 55.14440 35.14440 2727.76950 920 1807.76950
## 47: 4115-NZRKS 47 55.14440 35.14440 2782.91390 940 1842.91390
## 48: 9402-ORRAH 48 55.14440 35.14440 2838.05830 960 1878.05830
## 49: 2740-TVLFN 49 55.06381 35.06381 2893.12211 980 1913.12211
## 50: 4090-KPJIP 50 55.02747 35.02747 2948.14958 1000 1948.14958
## customerID INDICE ME MN INGRESOS_AGRE COSTES_AGRE ROI
Visualizamos las curvas
ggplot(df_campaña,aes(x = INDICE)) +
geom_line(aes(y = INGRESOS_AGRE, col='INGRESOS_AGRE')) +
geom_line(aes(y = COSTES_AGRE, col='COSTES_AGRE')) +
geom_line(aes(y = ROI, col='ROI')) +
labs(y='EUROS', x = 'TAMAÑO DE CAMPAÑA', colour = 'KPI')
Vamos a visualizar un zoom sobre el ROI solo en los tamaños de campaña que son positivos para localizar el punto optimo
df_campaña %>%
filter(ROI > 0) %>%
ggplot(aes(x = INDICE)) +
geom_line(aes(y = ROI, col='ROI')) +
geom_vline(aes(xintercept = MN_cero, col = 'PUNTO OPTIMO')) +
labs(x = 'TAMAÑO DE CAMPAÑA', y = 'ROI', colour = 'KPI')
#Generamos un pequeño informe
cat(
paste(
'El tamaño óptimo de campaña para el ROI es de:', MN_cero, 'clientes',
'\nCon unos ingresos esperados de margen neto acumulado de:', round(df_campaña[which(df_campaña$INDICE == MN_cero),'INGRESOS_AGRE']), '€',
'\nY unos costes agregados de:',
df_campaña[which(df_campaña$INDICE == MN_cero),'COSTES_AGRE'], '€',
'\nQue van a generar un Retorno Neto de la Inversión de:',
round(df_campaña[which(df_campaña$INDICE == MN_cero),'ROI']),'€'
)
)
## El tamaño óptimo de campaña para el ROI es de: 1427 clientes
## Con unos ingresos esperados de margen neto acumulado de: 48510 €
## Y unos costes agregados de: 28540 €
## Que van a generar un Retorno Neto de la Inversión de: 19970 €