ANÁLISIS DE DATOS MASIVOS PARA EL NEGOCIO

Actividad 2 MODELOS DE CLASIFICACIÓN

library(readr)
base <- read_csv("C:/Users/Jamileth/Downloads/Actividad02_ADM/Actividad02_ADM/datos_teleco(1).csv")

## New names:
## Rows: 7032 Columns: 23
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): ID_cliente, Sexo, Socio, Empleado, Servicio_telefonico, Lineas_mul... dbl
## (5): ...1, Jubilado, Meses_alta, Gasto_mensual, Gasto_total
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

La base consta de 7.032 observaciones y 23 variables

#Verificamos que no existen datos faltantes en la base
which(is.na(base))

## integer(0)

# y lo comprobamos ya que todas las variables tienen 7.032 observaciones
str(base)

## spc_tbl_ [7,032 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1                  : num [1:7032] 1 2 3 4 5 6 7 8 9 10 ...
##  $ ID_cliente            : chr [1:7032] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ Sexo                  : chr [1:7032] "Female" "Male" "Male" "Male" ...
##  $ Jubilado              : num [1:7032] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Socio                 : chr [1:7032] "Yes" "No" "No" "No" ...
##  $ Empleado              : chr [1:7032] "No" "No" "No" "No" ...
##  $ Meses_alta            : num [1:7032] 1 34 2 45 2 8 22 10 28 62 ...
##  $ Servicio_telefonico   : chr [1:7032] "No" "Yes" "Yes" "No" ...
##  $ Lineas_multiples      : chr [1:7032] "No phone service" "No" "No" "No phone service" ...
##  $ Servicio_Internet     : chr [1:7032] "DSL" "DSL" "DSL" "DSL" ...
##  $ Seguridad_Online      : chr [1:7032] "No" "Yes" "Yes" "Yes" ...
##  $ CopiaSeguridad_Online : chr [1:7032] "Yes" "No" "Yes" "No" ...
##  $ Proteccion_dispositivo: chr [1:7032] "No" "Yes" "No" "Yes" ...
##  $ Soporte_tecnico       : chr [1:7032] "No" "No" "No" "Yes" ...
##  $ Television_carta      : chr [1:7032] "No" "No" "No" "No" ...
##  $ Peliculas_carta       : chr [1:7032] "No" "No" "No" "No" ...
##  $ Contrato              : chr [1:7032] "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ Factura_digital       : chr [1:7032] "Yes" "No" "Yes" "No" ...
##  $ Metodo_pago           : chr [1:7032] "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ Gasto_mensual         : num [1:7032] 29.9 57 53.9 42.3 70.7 ...
##  $ Gasto_total           : num [1:7032] 29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Abandono              : chr [1:7032] "No" "No" "Yes" "No" ...
##  $ meses_alta_cut        : chr [1:7032] "<1" "18-42" "1-6" "42-60" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   ID_cliente = col_character(),
##   ..   Sexo = col_character(),
##   ..   Jubilado = col_double(),
##   ..   Socio = col_character(),
##   ..   Empleado = col_character(),
##   ..   Meses_alta = col_double(),
##   ..   Servicio_telefonico = col_character(),
##   ..   Lineas_multiples = col_character(),
##   ..   Servicio_Internet = col_character(),
##   ..   Seguridad_Online = col_character(),
##   ..   CopiaSeguridad_Online = col_character(),
##   ..   Proteccion_dispositivo = col_character(),
##   ..   Soporte_tecnico = col_character(),
##   ..   Television_carta = col_character(),
##   ..   Peliculas_carta = col_character(),
##   ..   Contrato = col_character(),
##   ..   Factura_digital = col_character(),
##   ..   Metodo_pago = col_character(),
##   ..   Gasto_mensual = col_double(),
##   ..   Gasto_total = col_double(),
##   ..   Abandono = col_character(),
##   ..   meses_alta_cut = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

#Analizamos la composición de la variable dependiente
table(base$Abandono)

## 
##   No  Yes 
## 5163 1869

Modificamos variable dependiente a 0 y 1 library (dplyr)

base$Abandono <- ifelse (base$Abandono =="Yes",1,0)
table(base$Abandono)

## 
##    0    1 
## 5163 1869

prop.table(table(base$Abandono))

## 
##        0        1 
## 0.734215 0.265785

Creamos los dos submuestras una de entrenamiento (80%) y una de validación (20%), pero primero fijamos una semilla para que la muestra siempre que se ejecute sea la misma

library(caTools)
set.seed(12345)
division <- sample.split(base$Abandono, SplitRatio = 0.8)
entrenamiento <- subset (base, division==TRUE)
validacion <-  subset (base, division==FALSE)

Verificamos que está equilibrado

prop.table(table(entrenamiento$Abandono))

## 
##         0         1 
## 0.7342222 0.2657778

prop.table(table(validacion$Abandono))

## 
##         0         1 
## 0.7341862 0.2658138

CREAMOS EL MODELO LOGIT

Mdentrenamiento <- glm (Abandono ~ as.factor(Contrato) + Factura_digital + as.factor(Servicio_Internet) +                                            as.factor(Soporte_tecnico) + as.factor(CopiaSeguridad_Online) +
                                   as.factor(Television_carta) + Meses_alta, data=entrenamiento, family ="binomial")

summary(Mdentrenamiento)

## 
## Call:
## glm(formula = Abandono ~ as.factor(Contrato) + Factura_digital + 
##     as.factor(Servicio_Internet) + as.factor(Soporte_tecnico) + 
##     as.factor(CopiaSeguridad_Online) + as.factor(Television_carta) + 
##     Meses_alta, family = "binomial", data = entrenamiento)
## 
## Coefficients: (3 not defined because of singularities)
##                                                     Estimate Std. Error z value
## (Intercept)                                         -0.56881    0.09013  -6.311
## as.factor(Contrato)One year                         -0.77037    0.11617  -6.632
## as.factor(Contrato)Two year                         -1.51997    0.19289  -7.880
## Factura_digitalYes                                   0.47773    0.08189   5.834
## as.factor(Servicio_Internet)Fiber optic              0.98858    0.08419  11.743
## as.factor(Servicio_Internet)No                      -0.96371    0.13992  -6.888
## as.factor(Soporte_tecnico)No internet service             NA         NA      NA
## as.factor(Soporte_tecnico)Yes                       -0.39071    0.09361  -4.174
## as.factor(CopiaSeguridad_Online)No internet service       NA         NA      NA
## as.factor(CopiaSeguridad_Online)Yes                 -0.09640    0.08438  -1.142
## as.factor(Television_carta)No internet service            NA         NA      NA
## as.factor(Television_carta)Yes                       0.49610    0.08240   6.021
## Meses_alta                                          -0.03417    0.00236 -14.475
##                                                     Pr(>|z|)    
## (Intercept)                                         2.77e-10 ***
## as.factor(Contrato)One year                         3.32e-11 ***
## as.factor(Contrato)Two year                         3.27e-15 ***
## Factura_digitalYes                                  5.42e-09 ***
## as.factor(Servicio_Internet)Fiber optic              < 2e-16 ***
## as.factor(Servicio_Internet)No                      5.67e-12 ***
## as.factor(Soporte_tecnico)No internet service             NA    
## as.factor(Soporte_tecnico)Yes                       3.00e-05 ***
## as.factor(CopiaSeguridad_Online)No internet service       NA    
## as.factor(CopiaSeguridad_Online)Yes                    0.253    
## as.factor(Television_carta)No internet service            NA    
## as.factor(Television_carta)Yes                      1.74e-09 ***
## Meses_alta                                           < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6513.9  on 5624  degrees of freedom
## Residual deviance: 4763.8  on 5615  degrees of freedom
## AIC: 4783.8
## 
## Number of Fisher Scoring iterations: 6

Se observa que todas las variables son significativas, incluido el intercepto

#Hacemos la predicción
predicentrenamiento <- predict(Mdentrenamiento,type = "response" )

# Codificamos la predicción
predicentren <- ifelse (predicentrenamiento > 0.5, 1, 0)

MATRIZ DE CONFUSIÓN

library (caret)

## Cargando paquete requerido: ggplot2

## Cargando paquete requerido: lattice

confusionMatrix(as.factor(entrenamiento$Abandono), as.factor(predicentren))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3710  420
##          1  746  749
##                                           
##                Accuracy : 0.7927          
##                  95% CI : (0.7819, 0.8032)
##     No Information Rate : 0.7922          
##     P-Value [Acc > NIR] : 0.4685          
##                                           
##                   Kappa : 0.4292          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8326          
##             Specificity : 0.6407          
##          Pos Pred Value : 0.8983          
##          Neg Pred Value : 0.5010          
##              Prevalence : 0.7922          
##          Detection Rate : 0.6596          
##    Detection Prevalence : 0.7342          
##       Balanced Accuracy : 0.7367          
##                                           
##        'Positive' Class : 0               
##

Los datos reflejan que el modelo acierta el 79,27% de las veces

Realizamos la predicción con los datos de validación

predicval <- predict (Mdentrenamiento, newdata = validacion, type="response")
predicvalcodi<- ifelse (predicval > 0.5,1,0)
confusionMatrix(as.factor(validacion$Abandono), as.factor(predicvalcodi))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 932 101
##          1 199 175
##                                           
##                Accuracy : 0.7868          
##                  95% CI : (0.7644, 0.8079)
##     No Information Rate : 0.8038          
##     P-Value [Acc > NIR] : 0.9489          
##                                           
##                   Kappa : 0.4039          
##                                           
##  Mcnemar's Test P-Value : 2.14e-08        
##                                           
##             Sensitivity : 0.8240          
##             Specificity : 0.6341          
##          Pos Pred Value : 0.9022          
##          Neg Pred Value : 0.4679          
##              Prevalence : 0.8038          
##          Detection Rate : 0.6624          
##    Detection Prevalence : 0.7342          
##       Balanced Accuracy : 0.7291          
##                                           
##        'Positive' Class : 0               
##

Con los datos de validación la exactitud ha disminuido de 79,27% a 78,68%.

ARBÓL DE DECISIÓN

library (rpart)
library (rpart.plot)
MArbolentre <- rpart (Abandono ~ as.factor(Contrato) + Factura_digital + as.factor(Servicio_Internet) +                                            as.factor(Soporte_tecnico) + as.factor(CopiaSeguridad_Online) +
                                   as.factor(Television_carta) + Meses_alta, data=entrenamiento, method ="class")
summary(MArbolentre)

## Call:
## rpart(formula = Abandono ~ as.factor(Contrato) + Factura_digital + 
##     as.factor(Servicio_Internet) + as.factor(Soporte_tecnico) + 
##     as.factor(CopiaSeguridad_Online) + as.factor(Television_carta) + 
##     Meses_alta, data = entrenamiento, method = "class")
##   n= 5625 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.07068004      0 1.0000000 1.0000000 0.02216119
## 2 0.01000000      3 0.7879599 0.7966555 0.02049516
## 
## Variable importance
##              as.factor(Contrato)                       Meses_alta 
##                               29                               22 
##     as.factor(Servicio_Internet)       as.factor(Soporte_tecnico) 
##                               14                               13 
## as.factor(CopiaSeguridad_Online)      as.factor(Television_carta) 
##                               10                                9 
##                  Factura_digital 
##                                2 
## 
## Node number 1: 5625 observations,    complexity param=0.07068004
##   predicted class=0  expected loss=0.2657778  P(node) =1
##     class counts:  4130  1495
##    probabilities: 0.734 0.266 
##   left son=2 (2505 obs) right son=3 (3120 obs)
##   Primary splits:
##       as.factor(Contrato)              splits as  RLL,      improve=350.9502, (0 missing)
##       as.factor(Soporte_tecnico)       splits as  RLL,      improve=239.1210, (0 missing)
##       Meses_alta                       < 16.5 to the right, improve=238.6637, (0 missing)
##       as.factor(Servicio_Internet)     splits as  LRL,      improve=202.4936, (0 missing)
##       as.factor(CopiaSeguridad_Online) splits as  RLL,      improve=153.7134, (0 missing)
##   Surrogate splits:
##       Meses_alta                       < 37.5 to the right, agree=0.789, adj=0.525, (0 split)
##       as.factor(Soporte_tecnico)       splits as  RLL,      agree=0.715, adj=0.360, (0 split)
##       as.factor(CopiaSeguridad_Online) splits as  RLL,      agree=0.659, adj=0.234, (0 split)
##       as.factor(Television_carta)      splits as  RLL,      agree=0.634, adj=0.178, (0 split)
##       as.factor(Servicio_Internet)     splits as  RRL,      agree=0.619, adj=0.144, (0 split)
## 
## Node number 2: 2505 observations
##   predicted class=0  expected loss=0.06866267  P(node) =0.4453333
##     class counts:  2333   172
##    probabilities: 0.931 0.069 
## 
## Node number 3: 3120 observations,    complexity param=0.07068004
##   predicted class=0  expected loss=0.4240385  P(node) =0.5546667
##     class counts:  1797  1323
##    probabilities: 0.576 0.424 
##   left son=6 (1406 obs) right son=7 (1714 obs)
##   Primary splits:
##       as.factor(Servicio_Internet)     splits as  LRL,      improve=111.16290, (0 missing)
##       as.factor(Soporte_tecnico)       splits as  RLL,      improve= 81.00356, (0 missing)
##       Meses_alta                       < 5.5  to the right, improve= 60.96558, (0 missing)
##       as.factor(CopiaSeguridad_Online) splits as  RLR,      improve= 54.74515, (0 missing)
##       as.factor(Television_carta)      splits as  RLR,      improve= 54.74515, (0 missing)
##   Surrogate splits:
##       as.factor(Soporte_tecnico)       splits as  RLL,      agree=0.685, adj=0.302, (0 split)
##       as.factor(CopiaSeguridad_Online) splits as  RLR,      agree=0.685, adj=0.300, (0 split)
##       as.factor(Television_carta)      splits as  RLR,      agree=0.685, adj=0.300, (0 split)
##       Factura_digital                  splits as  LR,       agree=0.659, adj=0.243, (0 split)
##       Meses_alta                       < 6.5  to the left,  agree=0.594, adj=0.098, (0 split)
## 
## Node number 6: 1406 observations
##   predicted class=0  expected loss=0.2766714  P(node) =0.2499556
##     class counts:  1017   389
##    probabilities: 0.723 0.277 
## 
## Node number 7: 1714 observations,    complexity param=0.07068004
##   predicted class=1  expected loss=0.4550758  P(node) =0.3047111
##     class counts:   780   934
##    probabilities: 0.455 0.545 
##   left son=14 (945 obs) right son=15 (769 obs)
##   Primary splits:
##       Meses_alta                       < 13.5 to the right, improve=72.476800, (0 missing)
##       as.factor(Soporte_tecnico)       splits as  R-L,      improve=14.950820, (0 missing)
##       Factura_digital                  splits as  LR,       improve= 8.897683, (0 missing)
##       as.factor(CopiaSeguridad_Online) splits as  R-L,      improve= 7.151263, (0 missing)
##       as.factor(Television_carta)      splits as  L-R,      improve= 2.513415, (0 missing)
##   Surrogate splits:
##       as.factor(CopiaSeguridad_Online) splits as  R-L, agree=0.609, adj=0.127, (0 split)
##       as.factor(Television_carta)      splits as  R-L, agree=0.591, adj=0.088, (0 split)
## 
## Node number 14: 945 observations
##   predicted class=0  expected loss=0.4137566  P(node) =0.168
##     class counts:   554   391
##    probabilities: 0.586 0.414 
## 
## Node number 15: 769 observations
##   predicted class=1  expected loss=0.2938882  P(node) =0.1367111
##     class counts:   226   543
##    probabilities: 0.294 0.706

rpart.plot (MArbolentre)

predictarbol <- predict(MArbolentre, type = "class")
confusionMatrix(as.factor(entrenamiento$Abandono), as.factor(predictarbol))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3904  226
##          1  952  543
##                                           
##                Accuracy : 0.7906          
##                  95% CI : (0.7797, 0.8011)
##     No Information Rate : 0.8633          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.365           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8040          
##             Specificity : 0.7061          
##          Pos Pred Value : 0.9453          
##          Neg Pred Value : 0.3632          
##              Prevalence : 0.8633          
##          Detection Rate : 0.6940          
##    Detection Prevalence : 0.7342          
##       Balanced Accuracy : 0.7550          
##                                           
##        'Positive' Class : 0               
##

La exactitud del modelo es de 79,06% (Accuracy)

Realizamos la predicción con los datos de validación

predicarbolval <- predict (MArbolentre, newdata = validacion, type="class")

confusionMatrix(as.factor(validacion$Abandono), as.factor(predicarbolval))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 973  60
##          1 244 130
##                                           
##                Accuracy : 0.7839          
##                  95% CI : (0.7615, 0.8052)
##     No Information Rate : 0.865           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3434          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7995          
##             Specificity : 0.6842          
##          Pos Pred Value : 0.9419          
##          Neg Pred Value : 0.3476          
##              Prevalence : 0.8650          
##          Detection Rate : 0.6915          
##    Detection Prevalence : 0.7342          
##       Balanced Accuracy : 0.7419          
##                                           
##        'Positive' Class : 0               
##

La exactitud del modelo con los datos de validación bajo de 79,06% a 78,39%.

ANÁLISIS DE DATOS MASIVOS PARA EL NEGOCIO

Actividad 2 MODELOS DE CLASIFICACIÓN

CONCLUSIÓN