1. IMPORTACIÓN Y MUESTREO

1.1 Preparación del entorno

#desactivar notación científica
options(scipen=999)

#carga de librerías
library(dplyr)
library(ggplot2)
library(randomForest)
library(ROCR)
library(purrr)
library(smbinning)
library(rpart)
library(rpart.plot)
library(prettydoc)

1.2 Carga de datos

#carga de datos

df <- read.csv("Telco-Customer-Churn.csv")

2. CALIDAD DE DATOS

2.1 Análisis exploratorio general y estadísticos básicos.

Mostramos las variables del dataframe y sus tipos

# Visualizamos las variables del dataframe y sus tipos
glimpse(df)

## Observations: 7,043
## Variables: 21
## $ customerID       <fct> 7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOCW,…
## $ gender           <fct> Female, Male, Male, Male, Female, Female, Male,…
## $ SeniorCitizen    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Partner          <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, …
## $ Dependents       <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, …
## $ tenure           <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58,…
## $ PhoneService     <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, …
## $ MultipleLines    <fct> No phone service, No, No, No phone service, No,…
## $ InternetService  <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, F…
## $ OnlineSecurity   <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Ye…
## $ OnlineBackup     <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No,…
## $ DeviceProtection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No,…
## $ TechSupport      <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, N…
## $ StreamingTV      <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, …
## $ StreamingMovies  <fct> No, No, No, No, No, Yes, No, No, Yes, No, No, N…
## $ Contract         <fct> Month-to-month, One year, Month-to-month, One y…
## $ PaperlessBilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No, Y…
## $ PaymentMethod    <fct> Electronic check, Mailed check, Mailed check, B…
## $ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10…
## $ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50…
## $ Churn            <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No,…

#visualizamos los estadísticos de cada variable, en formato dataframe con lapply
lapply(df,summary)

## $customerID
## 0002-ORFBO 0003-MKNFE 0004-TLHLJ 0011-IGKFF 0013-EXCHZ 0013-MHZWF 
##          1          1          1          1          1          1 
## 0013-SMEOE 0014-BMAQU 0015-UOCOJ 0016-QLJIS 0017-DINOC 0017-IUDMW 
##          1          1          1          1          1          1 
## 0018-NYROU 0019-EFAEP 0019-GFNTW 0020-INWCK 0020-JDNXP 0021-IKXGC 
##          1          1          1          1          1          1 
## 0022-TCJCI 0023-HGHWL 0023-UYUPN 0023-XUOPT 0027-KWYKW 0030-FNXPP 
##          1          1          1          1          1          1 
## 0031-PVLZI 0032-PGELS 0036-IHMOT 0040-HALCW 0042-JVWOJ 0042-RLHYP 
##          1          1          1          1          1          1 
## 0048-LUMLS 0048-PIHNL 0052-DCKON 0052-YNYOT 0056-EPFBG 0057-QBUQH 
##          1          1          1          1          1          1 
## 0058-EVZWM 0060-FUALY 0064-SUDOG 0064-YIJGF 0067-DKWBL 0068-FIGTF 
##          1          1          1          1          1          1 
## 0071-NDAFP 0074-HDKDG 0076-LVEPS 0078-XZMHT 0080-EMYVY 0080-OROZO 
##          1          1          1          1          1          1 
## 0082-LDZUE 0082-OQIQY 0083-PIVIK 0089-IIQKO 0093-EXYQL 0093-XWZFY 
##          1          1          1          1          1          1 
## 0094-OIFMO 0096-BXERS 0096-FCPUF 0098-BOWSO 0100-DUVFC 0103-CSITQ 
##          1          1          1          1          1          1 
## 0104-PPXDV 0106-GHRQR 0106-UGRDO 0107-WESLM 0107-YHINA 0111-KLBQG 
##          1          1          1          1          1          1 
## 0112-QAWRZ 0112-QWPNC 0114-IGABW 0114-PEGZZ 0114-RSRRW 0115-TFERT 
##          1          1          1          1          1          1 
## 0117-LFRMW 0118-JPNOY 0121-SNYRK 0122-OAHPZ 0123-CRBRT 0125-LZQXK 
##          1          1          1          1          1          1 
## 0128-MKWSG 0129-KPTWJ 0129-QMPDR 0130-SXOUN 0133-BMFZO 0134-XWXCE 
##          1          1          1          1          1          1 
## 0135-NMXAP 0136-IFMYD 0137-OCGAB 0137-UDEUO 0139-IVFJG 0141-YEAYS 
##          1          1          1          1          1          1 
## 0142-GVYSN 0147-ESWWR 0148-DCDOS 0151-ONTOV 0156-FVPTA 0164-APGRB 
##          1          1          1          1          1          1 
## 0164-XAIRP 0168-XZKBB 0174-QRVVY    (Other) 
##          1          1          1       6944 
## 
## $gender
## Female   Male 
##   3488   3555 
## 
## $SeniorCitizen
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1621  0.0000  1.0000 
## 
## $Partner
##   No  Yes 
## 3641 3402 
## 
## $Dependents
##   No  Yes 
## 4933 2110 
## 
## $tenure
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   29.00   32.37   55.00   72.00 
## 
## $PhoneService
##   No  Yes 
##  682 6361 
## 
## $MultipleLines
##               No No phone service              Yes 
##             3390              682             2971 
## 
## $InternetService
##         DSL Fiber optic          No 
##        2421        3096        1526 
## 
## $OnlineSecurity
##                  No No internet service                 Yes 
##                3498                1526                2019 
## 
## $OnlineBackup
##                  No No internet service                 Yes 
##                3088                1526                2429 
## 
## $DeviceProtection
##                  No No internet service                 Yes 
##                3095                1526                2422 
## 
## $TechSupport
##                  No No internet service                 Yes 
##                3473                1526                2044 
## 
## $StreamingTV
##                  No No internet service                 Yes 
##                2810                1526                2707 
## 
## $StreamingMovies
##                  No No internet service                 Yes 
##                2785                1526                2732 
## 
## $Contract
## Month-to-month       One year       Two year 
##           3875           1473           1695 
## 
## $PaperlessBilling
##   No  Yes 
## 2872 4171 
## 
## $PaymentMethod
## Bank transfer (automatic)   Credit card (automatic) 
##                      1544                      1522 
##          Electronic check              Mailed check 
##                      2365                      1612 
## 
## $MonthlyCharges
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.25   35.50   70.35   64.76   89.85  118.75 
## 
## $TotalCharges
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    18.8   401.4  1397.5  2283.3  3794.7  8684.8      11 
## 
## $Churn
##   No  Yes 
## 5174 1869

Conclusiones

Tras este primer vistazo detectamos que:

La variable customerID no aporta información, la eliminamos

df$customerID <- NULL

La variable SeniorCitizen con valores 0 y 1 debería ser de tipo factor. Cambiamos a “Yes” / “No” para igualarlas al resto de variables del conjunto de datos.

df <- df %>%
  mutate (SeniorCitizen = as.factor(ifelse(SeniorCitizen == 1,"Yes","No")))

La variables MultipleLines depende de la variable PhoneService, por lo que tiene, además de los valores “Yes” y “No”, el valor de “No phone service”. Cambiamos el “No phone service” por “No”

df$MultipleLines <- recode(df$MultipleLines, "No phone service"="No")

La variable InternetService contiene el tipo de producto de internet contratado.
Las variables OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTv y StreamingMovies hacen referencia a productos que el cliente solo puede contratar si tiene InternetService de algún tipo. Lo apuntamos para cambiar los “No internet service” a “No” en todas las variables.

# Creamos un vector con los nombres de las variables a cambiar
internet <- c("OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
              "StreamingTV","StreamingMovies")

# Por cada columna, cambiamos con recode el valor del "No internet service" por "No"
for (i in internet){
df[[i]] <- recode (df[[i]], "No internet service" = "No")
}

La variable tenure es de tipo entero, con 72 valores diferentes.
La variable Churn, que será la variable TARGET, es de tipo factor, habrá que transformarla en un factor con valores 0 y 1. Visualizamos los valores para comprobar si existe desbalanceo:

# Visualizamos la penetración de la variable target:
table(df$Churn)

## 
##   No  Yes 
## 5174 1869

ggplot (data=df,aes(x=Churn)) + geom_bar()

2.2 Análisis de nulos / Análisis de ceros

Visualizamos las variables que tiene valores NA

summary(is.na.data.frame(df))

##    gender        SeniorCitizen    Partner        Dependents     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:7043      FALSE:7043      FALSE:7043      FALSE:7043     
##                                                                 
##    tenure        PhoneService    MultipleLines   InternetService
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:7043      FALSE:7043      FALSE:7043      FALSE:7043     
##                                                                 
##  OnlineSecurity  OnlineBackup    DeviceProtection TechSupport    
##  Mode :logical   Mode :logical   Mode :logical    Mode :logical  
##  FALSE:7043      FALSE:7043      FALSE:7043       FALSE:7043     
##                                                                  
##  StreamingTV     StreamingMovies  Contract       PaperlessBilling
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical   
##  FALSE:7043      FALSE:7043      FALSE:7043      FALSE:7043      
##                                                                  
##  PaymentMethod   MonthlyCharges  TotalCharges      Churn        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:7043      FALSE:7043      FALSE:7032      FALSE:7043     
##                                  TRUE :11

Conclusiones

Solo hay una variable con NA,TotalCharges, con un total de 11 valores nulos. Comprobaremos si los 11 valores nulos de esta variable tienen alguna explicación de negocio.

# creamos una df con los valores nulos de TotalCharges
nulos <- df[is.na(df$TotalCharges),]

#visualizamos un resumen de las variables
lapply(nulos,summary)

## $gender
## Female   Male 
##      5      6 
## 
## $SeniorCitizen
##  No Yes 
##  11   0 
## 
## $Partner
##  No Yes 
##   2   9 
## 
## $Dependents
##  No Yes 
##   0  11 
## 
## $tenure
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0 
## 
## $PhoneService
##  No Yes 
##   2   9 
## 
## $MultipleLines
##  No Yes 
##   7   4 
## 
## $InternetService
##         DSL Fiber optic          No 
##           5           0           6 
## 
## $OnlineSecurity
##  No Yes 
##   7   4 
## 
## $OnlineBackup
##  No Yes 
##   7   4 
## 
## $DeviceProtection
##  No Yes 
##   7   4 
## 
## $TechSupport
##  No Yes 
##   7   4 
## 
## $StreamingTV
##  No Yes 
##   7   4 
## 
## $StreamingMovies
##  No Yes 
##  10   1 
## 
## $Contract
## Month-to-month       One year       Two year 
##              0              1             10 
## 
## $PaperlessBilling
##  No Yes 
##   8   3 
## 
## $PaymentMethod
## Bank transfer (automatic)   Credit card (automatic) 
##                         2                         1 
##          Electronic check              Mailed check 
##                         0                         8 
## 
## $MonthlyCharges
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.70   20.12   25.75   41.42   58.98   80.85 
## 
## $TotalCharges
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      NA      NA      NA     NaN      NA      NA      11 
## 
## $Churn
##  No Yes 
##  11   0

De los 11 valores nulos en NA:

11 no son “SeniorCitizen”
11 son “Dependents”
Los 11 tienen un tenure de 0
Todos tienen “MonthlyCharges” > 0
Todos tienen “Churn” de 0

Podríamos entender que son clientes que acaban de contratar el servicio, que tienen cargos mensuales que no se han sumado al total.

Teniendo en cuenta el número reducido de clientes y que son 0 para la variable target, decidimos eliminarlos.

df <- df[complete.cases(df),]

2.3 Análisis de atípicos

Analizamos de forma gráfica las variables continuas.

# Histograma de la variable "tenure"
ggplot(df,aes(x=tenure)) + geom_histogram(binwidth = 1)

# Histograma de la variable "MonthlyCharges"
ggplot(df,aes(x=MonthlyCharges)) + geom_histogram(binwidth = 10)

# Histograma de la variable "TotalCharges"
ggplot(df,aes(x=TotalCharges)) + geom_histogram(binwidth = 10)

Conclusiones

La variable “tenure” recoge el tiempo de permanencia de un cliente, tiene una distribución que podemos entender según el tipo de negocio. Existe un mayor número de clientes con poco tiempo de permanencia, que va decreciendo con el tiempo hasta estabilizarse a partir de los 20 meses aproximadamente. El pico que se observa en el extremo contrario, de más antigüedad, podría explicarse bien por un cambio en el sistema de registro en un momento determinado y que hasta entonces no se hubiese recogido la fecha de alta, o bien por pérdida de datos en algún momento. Apuntamos la variable para discretizarla.
Las variables “MonthlyCharges” y “TotalCharges” tienen una distribución que entra dentro de lo esperable. Las apuntamos para discretizarlas.

2.5 Análisis longitudinal - Otros análisis

2.6 Acciones según resultados del análisis exploratorio

Eliminar variables sin información (ID)
Regularizar “SeniorCitizen” a factor “Yes”/“No” al igual que las demás variables
Cambiar los valores “no phone service” y “no internet service” de las variables que los tienen a “No”
Eliminar las filas con valores nulos en “TotalCharges”

(Las acciones ya se han llevado a cabo en el apartado de )

3. TRANSFORMACIÓN

3.1 Creación de la variable target y preselección de las variables independientes

Creamos la variable TARGET a partir de la variable Churn como factor con los valores O (no) y 1 (sí). Eliminamos la variable Churn.

df <- df %>%
  mutate (TARGET = as.factor(as.numeric(ifelse(Churn == "Yes",1,0)))) %>%
  select(-Churn)

Creamos un vector con las primeras variables independientes.

independientes <- setdiff(names(df),"TARGET")

3.2 Preselección de variables con Random Forest

Como primer paso, creamos un nuevo DataFrame con una muestra del conjunto de datos inicial. El tamaño de la muestra lo calculamos en https://es.surveymonkey.com/mp/sample-size-calculator/ con un nivel de confianza del 95% y un márgen de error del 5%, siendo el resultado 365.

set.seed(0123)
muestra <- sample_n(df,365)

A partir de la muestra, hacemos la primera preselección con Random Forest

# creamos el Random Forest
pre_rf <- randomForest(formula = reformulate(independientes,"TARGET"), 
                       data= muestra,mtry=2,ntree=50, importance = T)
# seleccionamos el decrecimiento de Gini
imp_rf <- importance(pre_rf)[,4] 

# transformamos a DataFrame y creamos el ranking.
imp_rf <- data.frame(VARIABLE = names(imp_rf), IMP_RF = imp_rf) 

imp_rf <- imp_rf %>% arrange(desc(IMP_RF)) %>% 
  mutate(RANKING_RF = 1:nrow(imp_rf))

# visualizamos el ranking_rf
imp_rf

##            VARIABLE    IMP_RF RANKING_RF
## 1            tenure 14.580215          1
## 2      TotalCharges 13.985598          2
## 3    MonthlyCharges 13.181016          3
## 4          Contract  7.223082          4
## 5     PaymentMethod  6.597656          5
## 6   InternetService  5.354578          6
## 7  DeviceProtection  3.782765          7
## 8     MultipleLines  3.403536          8
## 9    OnlineSecurity  3.372931          9
## 10      TechSupport  3.330475         10
## 11 PaperlessBilling  3.197936         11
## 12       Dependents  2.844183         12
## 13          Partner  2.618785         13
## 14           gender  2.520908         14
## 15    SeniorCitizen  2.462045         15
## 16  StreamingMovies  2.444497         16
## 17     OnlineBackup  2.046391         17
## 18      StreamingTV  2.045911         18
## 19     PhoneService  1.344579         19

3.3 Preselección de variables con Information Value

Con la misma muestra, hacemos una preselección con Information Value, obteniendo el ranking de variables con este método.

#creamos una nueva muestra con la variable TARGET pasada a numérico (0 y 1)
muestra2 <- mutate(muestra,TARGET = as.numeric(as.character(TARGET))) 

# calculamos el IV y creamos el ranking
imp_iv <- smbinning.sumiv(muestra2[c(independientes,'TARGET')],y="TARGET")

##  
## 
  |                                                        
  |                                                  |   0%
  |                                                        
  |--                                                |   5%
  |                                                        
  |-----                                             |  10%
  |                                                        
  |--------                                          |  15%
  |                                                        
  |----------                                        |  20%
  |                                                        
  |------------                                      |  25%
  |                                                        
  |---------------                                   |  30%
  |                                                        
  |------------------                                |  35%
  |                                                        
  |--------------------                              |  40%
  |                                                        
  |----------------------                            |  45%
  |                                                        
  |-------------------------                         |  50%
  |                                                        
  |----------------------------                      |  55%
  |                                                        
  |------------------------------                    |  60%
  |                                                        
  |--------------------------------                  |  65%
  |                                                        
  |-----------------------------------               |  70%
  |                                                        
  |--------------------------------------            |  75%
  |                                                        
  |----------------------------------------          |  80%
  |                                                        
  |------------------------------------------        |  85%
  |                                                        
  |---------------------------------------------     |  90%
  |                                                        
  |------------------------------------------------  |  95%
  |                                                        
  |--------------------------------------------------| 100%
##

imp_iv <- imp_iv %>% mutate(Ranking = 1:nrow(imp_iv)) %>% select(-Process)
names(imp_iv) <- c('VARIABLE','IMP_IV','RANKING_IV')

# visualizamos el ranking_IV
imp_iv

##            VARIABLE IMP_IV RANKING_IV
## 1          Contract 1.3887          1
## 2            tenure 0.7035          2
## 3   InternetService 0.5192          3
## 4     PaymentMethod 0.4660          4
## 5        Dependents 0.3621          5
## 6      TotalCharges 0.2918          6
## 7    MonthlyCharges 0.2597          7
## 8    OnlineSecurity 0.1945          8
## 9       TechSupport 0.1832          9
## 10    SeniorCitizen 0.1523         10
## 11 PaperlessBilling 0.1215         11
## 12          Partner 0.1139         12
## 13    MultipleLines 0.0450         13
## 14      StreamingTV 0.0392         14
## 15     OnlineBackup 0.0340         15
## 16  StreamingMovies 0.0049         16
## 17           gender 0.0030         17
## 18 DeviceProtection 0.0018         18
## 19     PhoneService 0.0000         19

3.4 Preselección final

Creamos un ranking final para decidir las variables independientes que pasarán a la fase de modelización.

# creamos el ranking de importancia final
imp_final <- inner_join(imp_rf,imp_iv,by='VARIABLE') %>% 
  select(VARIABLE,IMP_RF,IMP_IV,RANKING_RF,RANKING_IV) %>% 
  mutate(RANKING_TOTAL = RANKING_RF + RANKING_IV) %>% 
  arrange(RANKING_TOTAL)

## Warning: Column `VARIABLE` joining factors with different levels, coercing
## to character vector

# visualizamos el ranking
imp_final

##            VARIABLE    IMP_RF IMP_IV RANKING_RF RANKING_IV RANKING_TOTAL
## 1            tenure 14.580215 0.7035          1          2             3
## 2          Contract  7.223082 1.3887          4          1             5
## 3      TotalCharges 13.985598 0.2918          2          6             8
## 4     PaymentMethod  6.597656 0.4660          5          4             9
## 5   InternetService  5.354578 0.5192          6          3             9
## 6    MonthlyCharges 13.181016 0.2597          3          7            10
## 7    OnlineSecurity  3.372931 0.1945          9          8            17
## 8        Dependents  2.844183 0.3621         12          5            17
## 9       TechSupport  3.330475 0.1832         10          9            19
## 10    MultipleLines  3.403536 0.0450          8         13            21
## 11 PaperlessBilling  3.197936 0.1215         11         11            22
## 12 DeviceProtection  3.782765 0.0018          7         18            25
## 13          Partner  2.618785 0.1139         13         12            25
## 14    SeniorCitizen  2.462045 0.1523         15         10            25
## 15           gender  2.520908 0.0030         14         17            31
## 16  StreamingMovies  2.444497 0.0049         16         16            32
## 17     OnlineBackup  2.046391 0.0340         17         15            32
## 18      StreamingTV  2.045911 0.0392         18         14            32
## 19     PhoneService  1.344579 0.0000         19         19            38

Descartamos las variables que tengan un ranking mayor de 10 en cualquiera de las dos listas. Descartamos también las variables que vamos a discretizar.

independientes <- imp_final %>%
  filter(RANKING_RF <= 10 | RANKING_IV <= 10) %>% 
  select(VARIABLE) 
independientes <- as.character(independientes$VARIABLE)

3.5. Creación de variables sintéticas

Detectamos que existen una serie de variables que muestran la contratación o no de un producto relacionado con internet. Creamos una variable que resuma el número de productos contratados.

# creamos un dataframe temporal con las variables a resumir
df_temp <- df[,9:14]

# cambiamos el tipo de variable a numérico
a_numero = names(df_temp)
df_temp [a_numero] <- lapply(df_temp[a_numero],as.numeric)

# restamos 1 para que quede como 0 y 1
df_temp[a_numero] <- df_temp [a_numero]-1


# creamos la variable en el dataframe temporal
df_temp <- df_temp %>%
  mutate (PRODUCTOS_CONTRATADOS = (
    OnlineSecurity + OnlineBackup + DeviceProtection + 
      TechSupport + StreamingTV + StreamingMovies))%>%
  select(-a_numero)


# pasamos la variable a factor
df_temp$PRODUCTOS_CONTRATADOS <- as.factor(df_temp$PRODUCTOS_CONTRATADOS)

# pegamos el dataframe temporal al dataframe de trabajo
df <- cbind(df,df_temp)

3.6 Discretización

Creamos una función para discretizar de forma automática una variable

discretizar <- function(vi,target){
  temp_df <- data.frame(vi = vi, target = target)
    temp_df$target <- as.numeric(as.character(temp_df$target))
  disc <- smbinning(temp_df, y = 'target', x = 'vi')
  return(disc)
}

Discretizamos las variables “tenure”, “TotalCharges” y “MonthlyCharges”

# discretización de "tenure"

disc_temp_tenure <- discretizar (df$tenure,df$TARGET)
df_temp <- select (df,tenure,TARGET)
df_temp <- smbinning.gen(df_temp,disc_temp_tenure,chrname="tenure_DISC")
df <- cbind(df,df_temp[3]) %>% select(-tenure)

# discretizacion de "TotalCharges"

disc_temp_TotalCharges <- discretizar (df$TotalCharges,df$TARGET)
df_temp <- select (df,TotalCharges,TARGET)
df_temp <- smbinning.gen(df_temp,disc_temp_TotalCharges,chrname="TotalCharges_DISC")
df <- cbind(df,df_temp[3]) %>% select(-TotalCharges)

# discretizacion de "MonthlyCharges"

disc_temp_MonthlyCharges <- discretizar (df$MonthlyCharges,df$TARGET)
df_temp <- select (df,MonthlyCharges,TARGET)
df_temp <- smbinning.gen(df_temp,disc_temp_MonthlyCharges,chrname="MonthlyCharges_DISC")
df <- cbind(df,df_temp[3]) %>% select(-MonthlyCharges)

# quitamos las variables de la lista de independientes

quitar <- c("tenure","TotalCharges","MonthlyCharges")
independientes <- setdiff(independientes,quitar)

3.7 Preparación variables independientes finales

Finalmente utilizaremos las variables independientes que pasaron la fase de preselección junto con las variables discretizadas y la variable sintética creada.

nuevas <- c("tenure_DISC","TotalCharges_DISC","MonthlyCharges_DISC","PRODUCTOS_CONTRATADOS")
independientes <- append(independientes,nuevas)


# visualizamos las variables independientes finales
independientes

##  [1] "Contract"              "PaymentMethod"        
##  [3] "InternetService"       "OnlineSecurity"       
##  [5] "Dependents"            "TechSupport"          
##  [7] "MultipleLines"         "DeviceProtection"     
##  [9] "SeniorCitizen"         "tenure_DISC"          
## [11] "TotalCharges_DISC"     "MonthlyCharges_DISC"  
## [13] "PRODUCTOS_CONTRATADOS"

4. MODELIZACIÓN

4.1 Funciones a utilizar

Creamos las funciones que utilizaremos para evaluar cada modelo. Funcion para crear una matriz de confusion

# crear matriz de confusión
confusion<-function(real,scoring,umbral){ 
  conf<-table(real,scoring>=umbral)
  if(ncol(conf)==2) return(conf) else return(NULL)
}

# calcular las métricas: acierto, precisión, cobertura y F1
metricas<-function(matriz_conf){
  acierto <- (matriz_conf[1,1] + matriz_conf[2,2]) / sum(matriz_conf) *100
  precision <- matriz_conf[2,2] / (matriz_conf[2,2] + matriz_conf[1,2]) *100
  cobertura <- matriz_conf[2,2] / (matriz_conf[2,2] + matriz_conf[2,1]) *100
  F1 <- 2*precision*cobertura/(precision+cobertura)
  salida<-c(acierto,precision,cobertura,F1)
  return(salida)
}

# calcular umbrales y ver su efecto sobre precisión y cobertura
umbrales<-function(real,scoring){
  umbrales<-data.frame(umbral=rep(0,times=19),acierto=rep(0,times=19),precision=rep(0,times=19),cobertura=rep(0,times=19),F1=rep(0,times=19))
  cont <- 1
  for (cada in seq(0.05,0.95,by = 0.05)){
    datos<-metricas(confusion(real,scoring,cada))
    registro<-c(cada,datos)
    umbrales[cont,]<-registro
    cont <- cont + 1
  }
  return(umbrales)
}

# calcular ROC
roc<-function(prediction){
  r<-performance(prediction,'tpr','fpr')
  plot(r)
}

# calcular AUC
auc<-function(prediction){
  a<-performance(prediction,'auc')
  return(a@y.values[[1]])
}

4.2 Creación de los conjuntos de entramiento y validación

Dividimos aleatoriamente el conjunto de datos en un conjunto de entrenamiento (train) y un conjunto de validación (test) con una distribución de 70:30.

# creamos una variable aleatoria en el dataframe, con una distribución 70:30
set.seed(0123)
df$random<-sample(0:1,size = nrow(df),replace = T,prob = c(0.3,0.7))

# creamos los dataframe train y test
train<-filter(df,random==1)
test<-filter(df,random==0)

#eliminamos la variable "random" en el dataframe inicial
df$random <- NULL

4.3 Modelización con regresión logística

Creamos la fórmula que utilizaremos en el modelo de regresión logística con las variables independientes y la target

target <- "TARGET"
formula <- reformulate(independientes,target)

Creamos un primer modelo de regresión logística con todas las variables independientes

# creamos una variable formula_rl y le asignamos el valor de la fórmula inicial
formula_rl <- formula

#creamos el primer modelo de regresión logística
rl<- glm(formula_rl,train,family=binomial(link='logit'))

# y lo visualizamos
summary(rl)

## 
## Call:
## glm(formula = formula_rl, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1742  -0.6657  -0.2653   0.5568   3.1136  
## 
## Coefficients:
##                                       Estimate Std. Error z value
## (Intercept)                           0.383811   0.324961   1.181
## ContractOne year                     -0.597070   0.131444  -4.542
## ContractTwo year                     -1.334412   0.228153  -5.849
## PaymentMethodCredit card (automatic) -0.043822   0.137883  -0.318
## PaymentMethodElectronic check         0.328963   0.115308   2.853
## PaymentMethodMailed check            -0.162743   0.141974  -1.146
## InternetServiceFiber optic            1.482211   0.238919   6.204
## InternetServiceNo                    -1.237625   0.327512  -3.779
## OnlineSecurityYes                    -0.674805   0.114451  -5.896
## DependentsYes                        -0.166990   0.098544  -1.695
## TechSupportYes                       -0.493500   0.120835  -4.084
## MultipleLinesYes                      0.466718   0.096974   4.813
## DeviceProtectionYes                  -0.056350   0.113790  -0.495
## SeniorCitizenYes                      0.178727   0.101099   1.768
## tenure_DISC02 <= 5                   -0.730956   0.168282  -4.344
## tenure_DISC03 <= 16                  -1.179034   0.224852  -5.244
## tenure_DISC04 <= 22                  -1.469015   0.255275  -5.755
## tenure_DISC05 <= 49                  -1.851228   0.238841  -7.751
## tenure_DISC06 <= 59                  -2.028977   0.299334  -6.778
## tenure_DISC07 <= 70                  -2.300200   0.360259  -6.385
## tenure_DISC08 > 70                   -3.693413   0.561122  -6.582
## TotalCharges_DISC02 <= 3233.85       -0.339779   0.191924  -1.770
## TotalCharges_DISC03 <= 5643.4        -0.770554   0.260838  -2.954
## TotalCharges_DISC04 > 5643.4         -0.759529   0.374195  -2.030
## MonthlyCharges_DISC02 <= 55.95       -0.005194   0.325252  -0.016
## MonthlyCharges_DISC03 <= 68.8        -0.555718   0.371403  -1.496
## MonthlyCharges_DISC04 <= 106.75      -0.804709   0.402850  -1.998
## MonthlyCharges_DISC05 > 106.75       -1.092936   0.545591  -2.003
## PRODUCTOS_CONTRATADOS1                0.407875   0.143225   2.848
## PRODUCTOS_CONTRATADOS2                0.593690   0.160391   3.702
## PRODUCTOS_CONTRATADOS3                0.761021   0.183625   4.144
## PRODUCTOS_CONTRATADOS4                1.207075   0.228259   5.288
## PRODUCTOS_CONTRATADOS5                1.537477   0.355546   4.324
## PRODUCTOS_CONTRATADOS6                2.158892   0.511698   4.219
##                                                 Pr(>|z|)    
## (Intercept)                                     0.237564    
## ContractOne year                     0.00000556163413087 ***
## ContractTwo year                     0.00000000495223202 ***
## PaymentMethodCredit card (automatic)            0.750623    
## PaymentMethodElectronic check                   0.004332 ** 
## PaymentMethodMailed check                       0.251676    
## InternetServiceFiber optic           0.00000000055112242 ***
## InternetServiceNo                               0.000158 ***
## OnlineSecurityYes                    0.00000000372375898 ***
## DependentsYes                                   0.090157 .  
## TechSupportYes                       0.00004425200666055 ***
## MultipleLinesYes                     0.00000148811220995 ***
## DeviceProtectionYes                             0.620453    
## SeniorCitizenYes                                0.077088 .  
## tenure_DISC02 <= 5                   0.00001401502978932 ***
## tenure_DISC03 <= 16                  0.00000015746941898 ***
## tenure_DISC04 <= 22                  0.00000000868301507 ***
## tenure_DISC05 <= 49                  0.00000000000000913 ***
## tenure_DISC06 <= 59                  0.00000000001215886 ***
## tenure_DISC07 <= 70                  0.00000000017157206 ***
## tenure_DISC08 > 70                   0.00000000004635567 ***
## TotalCharges_DISC02 <= 3233.85                  0.076664 .  
## TotalCharges_DISC03 <= 5643.4                   0.003135 ** 
## TotalCharges_DISC04 > 5643.4                    0.042380 *  
## MonthlyCharges_DISC02 <= 55.95                  0.987259    
## MonthlyCharges_DISC03 <= 68.8                   0.134584    
## MonthlyCharges_DISC04 <= 106.75                 0.045766 *  
## MonthlyCharges_DISC05 > 106.75                  0.045154 *  
## PRODUCTOS_CONTRATADOS1                          0.004402 ** 
## PRODUCTOS_CONTRATADOS2                          0.000214 ***
## PRODUCTOS_CONTRATADOS3               0.00003406643046968 ***
## PRODUCTOS_CONTRATADOS4               0.00000012354227467 ***
## PRODUCTOS_CONTRATADOS5               0.00001530407273830 ***
## PRODUCTOS_CONTRATADOS6               0.00002453105377560 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5696.0  on 4942  degrees of freedom
## Residual deviance: 3987.5  on 4909  degrees of freedom
## AIC: 4055.5
## 
## Number of Fisher Scoring iterations: 6

Creamos una nueva fórmula con las variables que tienen mayor valor predictivo (3 asteriscos en el listado, p<0.001)

 a_mantener <- c("Contract","InternetService","OnlineSecurity","TechSupport",
                "MultipleLines", "tenure_DISC", "PRODUCTOS_CONTRATADOS")

formula_rl <- reformulate (a_mantener,target)

Volvemos a crear el modelo con la nueva fórmula

rl<- glm(formula_rl,train,family=binomial(link='logit'))
summary(rl)

## 
## Call:
## glm(formula = formula_rl, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0976  -0.6818  -0.2706   0.5747   3.1482  
## 
## Coefficients:
##                            Estimate Std. Error z value
## (Intercept)                 0.37359    0.13987   2.671
## ContractOne year           -0.74028    0.12936  -5.723
## ContractTwo year           -1.54618    0.22422  -6.896
## InternetServiceFiber optic  0.94039    0.09653   9.742
## InternetServiceNo          -1.23815    0.17201  -7.198
## OnlineSecurityYes          -0.72883    0.10876  -6.701
## TechSupportYes             -0.57426    0.11549  -4.973
## MultipleLinesYes            0.36525    0.09140   3.996
## tenure_DISC02 <= 5         -0.84029    0.15109  -5.561
## tenure_DISC03 <= 16        -1.45233    0.14806  -9.809
## tenure_DISC04 <= 22        -1.78785    0.18353  -9.741
## tenure_DISC05 <= 49        -2.20913    0.15936 -13.863
## tenure_DISC06 <= 59        -2.53764    0.21491 -11.808
## tenure_DISC07 <= 70        -2.80509    0.23136 -12.124
## tenure_DISC08 > 70         -4.18513    0.47817  -8.752
## PRODUCTOS_CONTRATADOS1      0.40320    0.13623   2.960
## PRODUCTOS_CONTRATADOS2      0.49484    0.14619   3.385
## PRODUCTOS_CONTRATADOS3      0.58987    0.15763   3.742
## PRODUCTOS_CONTRATADOS4      0.97021    0.18576   5.223
## PRODUCTOS_CONTRATADOS5      1.09872    0.25390   4.327
## PRODUCTOS_CONTRATADOS6      1.61039    0.40260   4.000
##                                        Pr(>|z|)    
## (Intercept)                            0.007562 ** 
## ContractOne year              0.000000010487986 ***
## ContractTwo year              0.000000000005352 ***
## InternetServiceFiber optic < 0.0000000000000002 ***
## InternetServiceNo             0.000000000000611 ***
## OnlineSecurityYes             0.000000000020632 ***
## TechSupportYes                0.000000660747046 ***
## MultipleLinesYes              0.000064348315931 ***
## tenure_DISC02 <= 5            0.000000026766346 ***
## tenure_DISC03 <= 16        < 0.0000000000000002 ***
## tenure_DISC04 <= 22        < 0.0000000000000002 ***
## tenure_DISC05 <= 49        < 0.0000000000000002 ***
## tenure_DISC06 <= 59        < 0.0000000000000002 ***
## tenure_DISC07 <= 70        < 0.0000000000000002 ***
## tenure_DISC08 > 70         < 0.0000000000000002 ***
## PRODUCTOS_CONTRATADOS1                 0.003080 ** 
## PRODUCTOS_CONTRATADOS2                 0.000712 ***
## PRODUCTOS_CONTRATADOS3                 0.000182 ***
## PRODUCTOS_CONTRATADOS4        0.000000176067891 ***
## PRODUCTOS_CONTRATADOS5        0.000015089427150 ***
## PRODUCTOS_CONTRATADOS6        0.000063343132065 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5696.0  on 4942  degrees of freedom
## Residual deviance: 4047.1  on 4922  degrees of freedom
## AIC: 4089.1
## 
## Number of Fisher Scoring iterations: 6

Comprobamos que los coeficientes siguen la lógica del negocio según cada variable. Pasamos a comprobar el modelo en el conjunto de datos de validación (test)

# calculamos el pseudoR cuadrado
pr2_rl <- 1 -(rl$deviance / rl$null.deviance)
pr2_rl

## [1] 0.2894836

# creamos un vector de probabilidades aplicando el modelo al conjunto test
rl_predict<-predict(rl,test,type = 'response')

# y lo visualizamos gráficamente
plot(rl_predict~test$TARGET)

El siguiente paso es transformar la probabilidad en la decisión del cliente de abandonar la compañía. Con la función “umbrales” creamos diferentes umbrales de corte y seleccionamos el umbral que maximiza la F1

# creamos los umbrales y los visualizamos
umb_rl<-umbrales(test$TARGET,rl_predict)
umb_rl

##    umbral  acierto precision  cobertura         F1
## 1    0.05 52.89612  36.30363 96.6608084 52.7831094
## 2    0.10 61.36908  40.86022 93.4973638 56.8679850
## 3    0.15 67.01771  44.78261 90.5096661 59.9185573
## 4    0.20 71.61321  48.80952 86.4674868 62.3969562
## 5    0.25 73.43226  50.77093 81.0193322 62.4238321
## 6    0.30 76.06510  54.56954 72.4077329 62.2356495
## 7    0.35 78.07563  58.27124 68.7170475 63.0645161
## 8    0.40 79.27238  61.68385 63.0931459 62.3805387
## 9    0.45 79.55960  63.14815 59.9297012 61.4968440
## 10   0.50 79.99043  67.59907 50.9666081 58.1162325
## 11   0.55 79.55960  69.29348 44.8154657 54.4290288
## 12   0.60 78.74581  75.10040 32.8646749 45.7212714
## 13   0.65 78.41072  78.64078 28.4710018 41.8064516
## 14   0.70 76.68741  81.53846 18.6291740 30.3290415
## 15   0.75 76.20871  87.50000 14.7627417 25.2631579
## 16   0.80 74.53327  93.02326  7.0298770 13.0718954
## 17   0.85 73.33652  87.50000  2.4604569  4.7863248
## 18   0.90 72.80996 100.00000  0.1757469  0.3508772
## 19   0.95  0.95000   0.95000  0.9500000  0.9500000

# seleccionamos el que maximiza la F1 y lo almacenamos en "umbral_final_rl"
umbral_final_rl<-umb_rl[which.max(umb_rl$F1),1]
umbral_final_rl

## [1] 0.35

Evaluamos la matriz de confusion y las metricas con el umbral optimizado

# matriz de confusión
confusion(test$TARGET,rl_predict,umbral_final_rl)

##     
## real FALSE TRUE
##    0  1240  280
##    1   178  391

# métricas del modelo
rl_metricas<-filter(umb_rl,umbral==umbral_final_rl)
rl_metricas

##   umbral  acierto precision cobertura       F1
## 1   0.35 78.07563  58.27124  68.71705 63.06452

Evaluamos la ROC

#creamos el objeto prediction
rl_prediction<-prediction(rl_predict,test$TARGET)
#visualizamos la ROC
roc(rl_prediction)

Sacamos las metricas definitivas incluyendo el AUC

rl_metricas<-cbind(rl_metricas,AUC=round(auc(rl_prediction),2)*100)
print(t(rl_metricas))

##               [,1]
## umbral     0.35000
## acierto   78.07563
## precision 58.27124
## cobertura 68.71705
## F1        63.06452
## AUC       84.00000

4.4 Modelización con árbol de decisión

Creamos un primer modelo de árbol de decisión

Creamos el primer modelo

formula_ar <- formula
ar<-rpart(formula_ar, train, method = 'class', parms = list(
  split = "information"), 
  control = rpart.control(cp = 0.00001))

Revisamos donde el error de validacion cruzada empieza a crecer

printcp(ar)

## 
## Classification tree:
## rpart(formula = formula_ar, data = train, method = "class", parms = list(split = "information"), 
##     control = rpart.control(cp = 0.00001))
## 
## Variables actually used in tree construction:
##  [1] Contract              Dependents            DeviceProtection     
##  [4] InternetService       MonthlyCharges_DISC   MultipleLines        
##  [7] OnlineSecurity        PaymentMethod         PRODUCTOS_CONTRATADOS
## [10] SeniorCitizen         TechSupport           tenure_DISC          
## [13] TotalCharges_DISC    
## 
## Root node error: 1300/4943 = 0.263
## 
## n= 4943 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.06743590      0   1.00000 1.00000 0.023810
## 2  0.00807692      3   0.79769 0.79769 0.022020
## 3  0.00538462      6   0.77308 0.79538 0.021997
## 4  0.00346154      7   0.76769 0.78231 0.021862
## 5  0.00307692     12   0.74154 0.78154 0.021854
## 6  0.00230769     18   0.72077 0.77923 0.021830
## 7  0.00192308     21   0.71385 0.77154 0.021750
## 8  0.00153846     26   0.70308 0.77231 0.021758
## 9  0.00115385     36   0.68462 0.77923 0.021830
## 10 0.00107692     45   0.67231 0.77769 0.021814
## 11 0.00102564     50   0.66692 0.77385 0.021774
## 12 0.00076923     61   0.65538 0.77308 0.021766
## 13 0.00065934     69   0.64923 0.78077 0.021846
## 14 0.00051282     81   0.63923 0.78923 0.021934
## 15 0.00038462     84   0.63769 0.80154 0.022059
## 16 0.00025641     90   0.63538 0.81769 0.022220
## 17 0.00019231     99   0.63308 0.82385 0.022280
## 18 0.00001000    103   0.63231 0.82692 0.022310

plotcp(ar)

Empieza a crecer en 0.0015. Creamos un nuevo árbol con ese nivel de complejidad y limitando la profundidad a 7. Volvemos a comprobar la complejidad.

ar<-rpart(formula, train, method = 'class', parms = list(
  split = "information"), 
  control = rpart.control(cp = 0.0015,maxdepth = 7))

printcp(ar)

## 
## Classification tree:
## rpart(formula = formula, data = train, method = "class", parms = list(split = "information"), 
##     control = rpart.control(cp = 0.0015, maxdepth = 7))
## 
## Variables actually used in tree construction:
##  [1] Contract              InternetService       MultipleLines        
##  [4] OnlineSecurity        PaymentMethod         PRODUCTOS_CONTRATADOS
##  [7] SeniorCitizen         TechSupport           tenure_DISC          
## [10] TotalCharges_DISC    
## 
## Root node error: 1300/4943 = 0.263
## 
## n= 4943 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.0674359      0   1.00000 1.00000 0.023810
## 2 0.0080769      3   0.79769 0.79769 0.022020
## 3 0.0053846      6   0.77308 0.79615 0.022004
## 4 0.0034615      7   0.76769 0.78692 0.021910
## 5 0.0030769     12   0.74154 0.78538 0.021894
## 6 0.0023077     17   0.72385 0.77615 0.021798
## 7 0.0019231     18   0.72154 0.77308 0.021766
## 8 0.0015385     20   0.71769 0.77308 0.021766
## 9 0.0015000     21   0.71615 0.76769 0.021710

plotcp(ar)

Conseguimos con estos parámetros que el error cruzado no llegue a subir, así que seleccionamos este árbol como definitivo.

Creamos una visualización gráfica del árbol

rpart.plot(ar,type=2,extra = 7, under = TRUE,under.cex = 0.7,fallen.leaves=F,gap = 0,cex=0.2,yesno = 2,box.palette = "GnYlRd",branch.lty = 3)

Sacamos las reglas para que se puedan utilizar a la hora de implantar el árbol en un entorno de producción. Creamos también un dataframe con el nodo final de cada cliente para poder explotarlo posteriormente.

rpart.rules(ar,style = 'tall',cover = T)

## TARGET is 0.07 with cover 45% when
##     Contract is One year or Two year
## 
## TARGET is 0.18 with cover 14% when
##     Contract is Month-to-month
##     tenure_DISC is 03 <= 16 or 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is DSL or No
## 
## TARGET is 0.18 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Electronic check
##     OnlineSecurity is Yes
##     TotalCharges_DISC is 03 <= 5643.4 or 04 > 5643.4
## 
## TARGET is 0.24 with cover 0% when
##     Contract is Month-to-month
##     tenure_DISC is 02 <= 5 or 03 <= 16
##     InternetService is Fiber optic
##     OnlineSecurity is Yes
##     MultipleLines is Yes
##     PRODUCTOS_CONTRATADOS is 1 or 2 or 5
## 
## TARGET is 0.24 with cover 4% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1 or 02 <= 5
##     InternetService is No
## 
## TARGET is 0.27 with cover 0% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Electronic check
##     OnlineSecurity is Yes
##     TotalCharges_DISC is 02 <= 3233.85
##     TechSupport is Yes
## 
## TARGET is 0.29 with cover 7% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Bank transfer (automatic) or Credit card (automatic) or Mailed check
## 
## TARGET is 0.32 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1 or 02 <= 5
##     InternetService is DSL
##     TotalCharges_DISC is 02 <= 3233.85
##     SeniorCitizen is No
## 
## TARGET is 0.36 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Electronic check
##     OnlineSecurity is No
##     MultipleLines is Yes
##     TotalCharges_DISC is 04 > 5643.4
## 
## TARGET is 0.36 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 02 <= 5 or 03 <= 16
##     InternetService is Fiber optic
##     PaymentMethod is Mailed check
##     MultipleLines is No
## 
## TARGET is 0.38 with cover 0% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1
##     InternetService is Fiber optic
##     TechSupport is Yes
## 
## TARGET is 0.38 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1 or 02 <= 5
##     InternetService is DSL
##     TotalCharges_DISC is 01 <= 198
##     TechSupport is Yes
##     SeniorCitizen is No
## 
## TARGET is 0.43 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 02 <= 5 or 03 <= 16
##     InternetService is Fiber optic
##     PaymentMethod is Bank transfer (automatic) or Credit card (automatic) or Electronic check
##     MultipleLines is No
##     PRODUCTOS_CONTRATADOS is 0
## 
## TARGET is 0.44 with cover 2% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Electronic check
##     OnlineSecurity is No
##     MultipleLines is No
## 
## TARGET is 0.54 with cover 4% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1 or 02 <= 5
##     InternetService is DSL
##     TotalCharges_DISC is 01 <= 198
##     TechSupport is No
##     SeniorCitizen is No
## 
## TARGET is 0.59 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Electronic check
##     OnlineSecurity is Yes
##     TotalCharges_DISC is 02 <= 3233.85
##     TechSupport is No
## 
## TARGET is 0.59 with cover 5% when
##     Contract is Month-to-month
##     tenure_DISC is 04 <= 22 or 05 <= 49 or 06 <= 59 or 07 <= 70 or 08 > 70
##     InternetService is Fiber optic
##     PaymentMethod is Electronic check
##     OnlineSecurity is No
##     MultipleLines is Yes
##     TotalCharges_DISC is 02 <= 3233.85 or 03 <= 5643.4
## 
## TARGET is 0.62 with cover 4% when
##     Contract is Month-to-month
##     tenure_DISC is 02 <= 5 or 03 <= 16
##     InternetService is Fiber optic
##     PaymentMethod is Bank transfer (automatic) or Credit card (automatic) or Electronic check
##     MultipleLines is No
##     PRODUCTOS_CONTRATADOS is 1 or 2 or 3 or 4 or 5
## 
## TARGET is 0.67 with cover 0% when
##     Contract is Month-to-month
##     tenure_DISC is 02 <= 5 or 03 <= 16
##     InternetService is Fiber optic
##     OnlineSecurity is Yes
##     MultipleLines is Yes
##     PRODUCTOS_CONTRATADOS is 3 or 4
## 
## TARGET is 0.73 with cover 1% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1 or 02 <= 5
##     InternetService is DSL
##     SeniorCitizen is Yes
## 
## TARGET is 0.74 with cover 5% when
##     Contract is Month-to-month
##     tenure_DISC is 02 <= 5 or 03 <= 16
##     InternetService is Fiber optic
##     OnlineSecurity is No
##     MultipleLines is Yes
## 
## TARGET is 0.87 with cover 3% when
##     Contract is Month-to-month
##     tenure_DISC is 01 <= 1
##     InternetService is Fiber optic
##     TechSupport is No

ar_numnodos<-rpart.predict(ar,test,nn = T)
head(ar_numnodos)

##           0          1  nn
## 1 0.9329102 0.06708976   2
## 2 0.9329102 0.06708976   2
## 3 0.5660377 0.43396226 242
## 4 0.8207283 0.17927171  12
## 5 0.8207283 0.17927171  12
## 6 0.9329102 0.06708976   2

Calculamos los scorings y evaluamos el modelo

ar_predict<-predict(ar,test,type = 'prob')[,2]

Lo representamos gráficamente

plot(ar_predict~test$TARGET)

Con la funcion umbrales probamos diferentes cortes y seleccionamos el mejor

umb_ar<-umbrales(test$TARGET,ar_predict)
umb_ar

##    umbral  acierto precision cobertura       F1
## 1    0.05  0.05000   0.05000   0.05000  0.05000
## 2    0.10 66.44327  44.20035  88.40070 58.93380
## 3    0.15 66.44327  44.20035  88.40070 58.93380
## 4    0.20 75.39493  53.34957  76.97715 63.02158
## 5    0.25 76.83102  55.76662  72.23199 62.94028
## 6    0.30 78.55433  60.03317  63.62039 61.77474
## 7    0.35 78.98516  61.32404  61.86292 61.59230
## 8    0.40 78.98516  62.40458  57.46924 59.83532
## 9    0.45 79.27238  64.71861  52.54833 58.00194
## 10   0.50 79.27238  64.71861  52.54833 58.00194
## 11   0.55 79.12877  67.45407  45.16696 54.10526
## 12   0.60 79.36812  73.63014  37.78559 49.94193
## 13   0.65 78.55433  79.80296  28.47100 41.96891
## 14   0.70 78.26711  79.48718  27.24077 40.57592
## 15   0.75 75.44280  91.17647  10.89631 19.46625
## 16   0.80 75.44280  91.17647  10.89631 19.46625
## 17   0.85 75.44280  91.17647  10.89631 19.46625
## 18   0.90  0.90000   0.90000   0.90000  0.90000
## 19   0.95  0.95000   0.95000   0.95000  0.95000

umbral_final_ar<-umb_ar[which.max(umb_ar$F1),1]
umbral_final_ar

## [1] 0.2

Evaluamos la matriz de confusion y las metricas con el umbral optimizado

confusion(test$TARGET,ar_predict,umbral_final_ar)

##     
## real FALSE TRUE
##    0  1137  383
##    1   131  438

ar_metricas<-filter(umb_ar,umbral==umbral_final_ar)
ar_metricas

##   umbral  acierto precision cobertura       F1
## 1    0.2 75.39493  53.34957  76.97715 63.02158

Evaluamos la ROC

ar_prediction<-prediction(ar_predict,test$TARGET)
roc(ar_prediction)

Sacamos las metricas definitivas incluyendo el AUC

ar_metricas<-cbind(ar_metricas,AUC=round(auc(ar_prediction),2)*100)
print(t(ar_metricas))

##               [,1]
## umbral     0.20000
## acierto   75.39493
## precision 53.34957
## cobertura 76.97715
## F1        63.02158
## AUC       82.00000

4.5 Modelización con Random Forest

Creamos la fórmula y el modelo

formula_rf <- formula
rf<-randomForest(formula_rf,train,importance=T)
rf

## 
## Call:
##  randomForest(formula = formula_rf, data = train, importance = T) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 20.31%
## Confusion matrix:
##      0   1 class.error
## 0 3246 397   0.1089761
## 1  607 693   0.4669231

Visualizamos las variables mas importantes

varImpPlot(rf)

Unificamos la importancia en una variable agregada para tener una mejor visión

importancia <- importance(rf)[,3:4]
#normalizamos para poner las dos variables en la misma scala. los valores negativos son las que menos predicen y los positivos las que mas
importancia_norm <- as.data.frame(scale(importancia))
#creamos una unica variable como suma de las otras
importancia_norm <- importancia_norm %>% mutate(
  Variable = rownames(importancia_norm),
  Imp_tot = MeanDecreaseAccuracy + MeanDecreaseGini) %>%
  mutate(Imp_tot = Imp_tot + abs(min(Imp_tot))) %>% 
  arrange(desc(Imp_tot)) %>% 
  select(Variable,Imp_tot,MeanDecreaseAccuracy,MeanDecreaseGini)
#hacemos un grafico para ver la curva de caida de importancia
ggplot(importancia_norm, aes(reorder(Variable,-Imp_tot),Imp_tot)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90,size = 7))

importancia_norm

##                 Variable   Imp_tot MeanDecreaseAccuracy MeanDecreaseGini
## 1            tenure_DISC 5.4589505            1.2548982       2.01260946
## 2               Contract 4.8767226            1.2071498       1.47813000
## 3        InternetService 4.5465753            1.8665084       0.48862404
## 4    MonthlyCharges_DISC 2.8113730            0.5575725       0.06235779
## 5          PaymentMethod 2.4376864           -0.6233359       0.86957954
## 6      TotalCharges_DISC 2.4189672            0.3671306      -0.13960616
## 7  PRODUCTOS_CONTRATADOS 2.0024564           -0.6305298       0.44154350
## 8         OnlineSecurity 1.6392708            0.1996406      -0.75181265
## 9            TechSupport 1.1306378           -0.1872266      -0.87357843
## 10         MultipleLines 0.5687676           -0.7687999      -0.85387533
## 11         SeniorCitizen 0.3344596           -0.9487855      -0.90819766
## 12      DeviceProtection 0.2628890           -0.9820463      -0.94650749
## 13            Dependents 0.0000000           -1.3121762      -0.87926660

Escogemos desde “TechSupport” las variables con mayor importancia, a partir de 1.1 y creamos una nueva fórmula con esas variables.

a_mantener <- importancia_norm %>% 
  filter(Imp_tot > 1.1) %>% 
  select(Variable)
a_mantener <- as.character((a_mantener$Variable))
formula_rf <- reformulate(a_mantener,target)

Creamos de nuevo el modelo con las nueva fórmula

rf<-randomForest(formula_rf,train,importance=T)
rf

## 
## Call:
##  randomForest(formula = formula_rf, data = train, importance = T) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 20.98%
## Confusion matrix:
##      0   1 class.error
## 0 3172 471   0.1292890
## 1  566 734   0.4353846

Aplicamos el modelo al conjunto de test, generando un vector con las probabilidades (seleccionamos la segunda columna de la matriz que nos da como resultado el predict)

rf_predict<-predict(rf,test,type = 'prob')[,2]

Lo representamos gráficamente en un boxplot

plot(rf_predict~test$TARGET)

Probamos los diferentes cortes y seleccionamos el mejor.

umb_rf<-umbrales(test$TARGET,rf_predict)
umb_rf

##    umbral  acierto precision cobertura       F1
## 1    0.05 65.39014  43.10036  84.53427 57.09199
## 2    0.10 70.79943  47.83069  79.43761 59.70938
## 3    0.15 73.81522  51.30641  75.92267 61.23317
## 4    0.20 75.63427  53.90625  72.75923 61.92969
## 5    0.25 76.16084  54.96503  69.06854 61.21495
## 6    0.30 76.83102  56.48855  65.02636 60.45752
## 7    0.35 78.02776  59.16667  62.39016 60.73567
## 8    0.40 78.31498  60.24735  59.92970 60.08811
## 9    0.45 78.55433  61.26629  57.82074 59.49367
## 10   0.50 78.88942  62.59843  55.88752 59.05292
## 11   0.55 79.36812  64.49580  53.95431 58.75598
## 12   0.60 79.55960  67.23301  48.68190 56.47299
## 13   0.65 79.60747  69.58904  44.63972 54.38972
## 14   0.70 79.55960  70.52023  42.88225 53.33333
## 15   0.75 79.32025  72.02572  39.36731 50.90909
## 16   0.80 79.08090  75.58140  34.27065 47.15840
## 17   0.85 78.36285  76.00000  30.05272 43.07305
## 18   0.90 78.26711  80.42328  26.71353 40.10554
## 19   0.95 77.35759  84.28571  20.73814 33.28632

umbral_final_rf<-umb_rf[which.max(umb_rf$F1),1]
umbral_final_rf

## [1] 0.2

Evaluamos la matriz de confusion y las metricas con el umbral optimizado

confusion(test$TARGET,rf_predict,umbral_final_rf)

##     
## real FALSE TRUE
##    0  1166  354
##    1   155  414

rf_metricas<-filter(umb_rf,umbral==umbral_final_rf)
rf_metricas

##   umbral  acierto precision cobertura       F1
## 1    0.2 75.63427  53.90625  72.75923 61.92969

Evaluamos la ROC

#creamos el objeto prediction
rf_prediction<-prediction(rf_predict,test$TARGET)
#visualizamos la ROC
roc(rf_prediction)

Sacamos las metricas definitivas incluyendo el AUC

rf_metricas<-cbind(rf_metricas,AUC=round(auc(rf_prediction),2)*100)
print(t(rf_metricas))

##               [,1]
## umbral     0.20000
## acierto   75.63427
## precision 53.90625
## cobertura 72.75923
## F1        61.92969
## AUC       81.00000

5. EVALUACIÓN

Sacamos una tabla comparativa de las métricas de cada modelo

comparativa <- rbind(rl_metricas,ar_metricas,rf_metricas)
rownames(comparativa) <- c('Reg.Log.','Arb.Dec.','Rand.For.')
t(comparativa) #t simplemente transpone para leerlo mejor

##           Reg.Log. Arb.Dec. Rand.For.
## umbral     0.35000  0.20000   0.20000
## acierto   78.07563 75.39493  75.63427
## precision 58.27124 53.34957  53.90625
## cobertura 68.71705 76.97715  72.75923
## F1        63.06452 63.02158  61.92969
## AUC       84.00000 82.00000  81.00000

Y escribimos el scoring con el modelo que mejor funciona (regresión logística) en el dataframe original

df$SCORING_CHURN <- predict(rl,df,type = 'response')

Creamos un gráfico para ver la tasa de abandono real según el tramo de scoring que calcula el modelo.

#Creamos una funcion para visualizar el abandono real por percentiles de scoring
vis <- function(scoring,real) {
    #Preparar el dataframe de visualización
    vis_df <- data.frame(Scoring = scoring, Perc_Scoring = cut_number(scoring, 20), Real = real)
    levels(vis_df$Perc_Scoring) <- seq(from = 100,to = 5,by = -5)
    vis_gr <- vis_df %>% group_by(Perc_Scoring) %>% summarise(Tasa_Churn = mean(as.numeric(as.character(Real)))) %>% arrange(Perc_Scoring)
    #ordenar el factor para el gráfico
    vis_gr$Perc_Scoring <- factor(vis_gr$Perc_Scoring, levels = vis_gr$Perc_Scoring[order(vis_gr$Perc_Scoring, decreasing = T)])
    #hacemos el gráfico
    ggplot(vis_gr,aes(Perc_Scoring, Tasa_Churn)) + 
      geom_col(fill='grey') + 
      geom_hline(aes(yintercept =      mean(as.numeric(as.character(vis_df$Real)))), col = 'black') +
      labs(title = 'Abandono real por tramo de scoring', x = 'Tramo de scoring', y = 'Abandono real')
}
vis(df$SCORING_CHURN,df$TARGET)

Se puede afirmar que el modelo elegido es consistente porque la tasa de abandono real desciende a la vez que desciende el scoring calculado.

Ejercicio: Customer Churn

Curso DS4B

Tomás María Sánchez Elías

8/7/2019

1. IMPORTACIÓN Y MUESTREO

1.1 Preparación del entorno

1.2 Carga de datos

2. CALIDAD DE DATOS

2.1 Análisis exploratorio general y estadísticos básicos.

Conclusiones

2.2 Análisis de nulos / Análisis de ceros

Conclusiones

2.3 Análisis de atípicos

Conclusiones

2.5 Análisis longitudinal - Otros análisis

2.6 Acciones según resultados del análisis exploratorio

3. TRANSFORMACIÓN

3.1 Creación de la variable target y preselección de las variables independientes

3.2 Preselección de variables con Random Forest

3.3 Preselección de variables con Information Value

3.4 Preselección final

3.5. Creación de variables sintéticas

3.6 Discretización

3.7 Preparación variables independientes finales

4. MODELIZACIÓN

4.1 Funciones a utilizar

4.2 Creación de los conjuntos de entramiento y validación

4.3 Modelización con regresión logística

4.4 Modelización con árbol de decisión

4.5 Modelización con Random Forest

5. EVALUACIÓN