Importing the dataset

library(foreign)
library(modelr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse) 
## Warning: package 'ggplot2' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.5.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)
## 
## Attaching package: 'broom'
## 
## The following object is masked from 'package:modelr':
## 
##     bootstrap
library(ISLR)          
library(readr)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(class)
library(ROCR)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(caTools)
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(psych)  
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggpubr)
library(reshape)
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:class':
## 
##     condense
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
## 
## The following object is masked from 'package:dplyr':
## 
##     rename
library(Metrics)
## 
## Attaching package: 'Metrics'
## 
## The following object is masked from 'package:pROC':
## 
##     auc
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
## 
## The following objects are masked from 'package:modelr':
## 
##     mae, mape, mse, rmse
library(mlbench)
library(rsample)
## 
## Attaching package: 'rsample'
## 
## The following object is masked from 'package:e1071':
## 
##     permutations
library(cluster)    
## Warning: package 'cluster' was built under R version 4.2.3
library(factoextra)  
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra)   
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(modeldata)
## Warning: package 'modeldata' was built under R version 4.2.3
library(klaR)
## Warning: package 'klaR' was built under R version 4.2.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(naivebayes)
## naivebayes 0.9.7 loaded
# Two copies of the data: churn_data_a keeps numeric variables,
# churn_data_b will hold factor versions for the classifiers that need them
churn_data_a <- read.csv("/Users/gabrielmedina/Downloads/Quiz_3/insurance_churn.csv")
churn_data_b <- read.csv("/Users/gabrielmedina/Downloads/Quiz_3/insurance_churn.csv")

Cleaning the dataset

### dataset characteristics 
str(churn_data_a)
## 'data.frame':    5000 obs. of  9 variables:
##  $ member_id           : int  11718698 24449305 92334825 99286533 63192693 13252492 60108914 64755437 92095594 33580274 ...
##  $ loyalty_years       : int  1 18 13 4 3 5 18 10 2 3 ...
##  $ vehicles_covered    : int  1 3 3 1 1 2 3 1 2 4 ...
##  $ premium_plan_ind    : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ mobile_app_user     : int  0 0 1 1 1 1 0 1 0 0 ...
##  $ home_auto_bundle    : int  1 0 1 1 0 0 0 0 0 0 ...
##  $ auto_pay_ind        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ recent_rate_increase: int  0 0 0 0 1 0 0 0 0 1 ...
##  $ churn               : int  0 0 0 1 0 0 0 0 0 0 ...
prop.table(table(churn_data_a$churn))
## 
##      0      1 
## 0.8492 0.1508
### transforming the dataset's variables
churn_data_a <- subset(churn_data_a, select = -member_id)
churn_data_b <- subset(churn_data_b, select = -member_id)
churn_data_b$churn[churn_data_b$churn == 1] = "Yes"
churn_data_b$churn[churn_data_b$churn == 0] = "No"
churn_data_b$churn <- as.factor(churn_data_b$churn)
churn_data_b$premium_plan_ind_a <- as.factor(churn_data_b$premium_plan_ind)
summary(churn_data_b)
##  loyalty_years    vehicles_covered premium_plan_ind mobile_app_user 
##  Min.   : 0.000   Min.   :1.000    Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 5.000   1st Qu.:2.000    1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :10.000   Median :2.000    Median :0.0000   Median :1.0000  
##  Mean   : 9.927   Mean   :2.029    Mean   :0.1008   Mean   :0.5366  
##  3rd Qu.:15.000   3rd Qu.:3.000    3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :20.000   Max.   :4.000    Max.   :1.0000   Max.   :1.0000  
##  home_auto_bundle  auto_pay_ind    recent_rate_increase churn     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000       No :4246  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000       Yes: 754  
##  Median :0.0000   Median :0.0000   Median :0.0000                 
##  Mean   :0.2758   Mean   :0.1796   Mean   :0.1402                 
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000                 
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000                 
##  premium_plan_ind_a
##  0:4496            
##  1: 504            
##                    
##                    
##                    
## 
### train/test split (60/40) --> quantitative dataset
set.seed(123)
sample_a <- sample(c(TRUE, FALSE), nrow(churn_data_a), replace = T, prob = c(0.6,0.4))
train_a  <- churn_data_a[sample_a, ]
test_a   <- churn_data_a[!sample_a, ]
### train/test split (60/40) --> quantitative & qualitative dataset
set.seed(123)
sample_b <- sample(c(TRUE, FALSE), nrow(churn_data_b), replace = T, prob = c(0.6,0.4))
train_b  <- churn_data_b[sample_b, ]
test_b   <- churn_data_b[!sample_b, ]
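
The 60/40 split above is random but not stratified. As an alternative sketch (not used for the results below), caret::createDataPartition keeps the ~15% churn rate equal in both halves:

# Sketch: stratified 60/40 split that preserves the class proportions
set.seed(123)
idx_strat   <- createDataPartition(churn_data_b$churn, p = 0.6, list = FALSE)
train_strat <- churn_data_b[idx_strat, ]
test_strat  <- churn_data_b[-idx_strat, ]
prop.table(table(train_strat$churn))  # ~0.85 / 0.15, matching the full data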

Exploratory data analysis (EDA)

describe(churn_data_a)
##                      vars    n mean   sd median trimmed  mad min max range
## loyalty_years           1 5000 9.93 5.80     10    9.92 7.41   0  20    20
## vehicles_covered        2 5000 2.03 0.74      2    2.01 0.00   1   4     3
## premium_plan_ind        3 5000 0.10 0.30      0    0.00 0.00   0   1     1
## mobile_app_user         4 5000 0.54 0.50      1    0.55 0.00   0   1     1
## home_auto_bundle        5 5000 0.28 0.45      0    0.22 0.00   0   1     1
## auto_pay_ind            6 5000 0.18 0.38      0    0.10 0.00   0   1     1
## recent_rate_increase    7 5000 0.14 0.35      0    0.05 0.00   0   1     1
## churn                   8 5000 0.15 0.36      0    0.06 0.00   0   1     1
##                       skew kurtosis   se
## loyalty_years         0.01    -1.16 0.08
## vehicles_covered      0.22    -0.48 0.01
## premium_plan_ind      2.65     5.03 0.00
## mobile_app_user      -0.15    -1.98 0.01
## home_auto_bundle      1.00    -0.99 0.01
## auto_pay_ind          1.67     0.79 0.01
## recent_rate_increase  2.07     2.29 0.00
## churn                 1.95     1.81 0.01
# Mode helper: returns the most frequent value(s) of a vector
moda <- function(x) {
  tabla_frecuencias <- table(x)
  modas <- names(tabla_frecuencias[tabla_frecuencias == max(tabla_frecuencias)])
  if (is.numeric(x)) {
    modas <- as.numeric(modas)
  }
  return(modas)
}
# Apply the moda function to every column with sapply
modas_por_columna <- sapply(churn_data_a, moda)

# Show the mode(s) of each column
modas_por_columna
##        loyalty_years     vehicles_covered     premium_plan_ind 
##                    8                    2                    0 
##      mobile_app_user     home_auto_bundle         auto_pay_ind 
##                    1                    0                    0 
## recent_rate_increase                churn 
##                    0                    0
hist(churn_data_a$loyalty_years, 
     main = "Histogram of loyalty years", 
     xlab = "Years", 
     ylab = "Frequency", 
     col = "blue", 
     border = "black")

hist(churn_data_a$churn, 
     main = "Histogram of churn", 
     xlab = "churn", 
     ylab = "Frequency", 
     col = "red", 
     border = "black")

hist(churn_data_a$vehicles_covered, 
     main = "Histogram of vehicles covered", 
     xlab = "Vehicles", 
     ylab = "Frequency", 
     col = "lightblue", 
     border = "black")

ggplot(churn_data_a, aes(x=factor(loyalty_years), y=churn)) +
  geom_point(position = position_jitter(width = 0.2), aes(color=factor(loyalty_years)), alpha=0.5) +
  labs(title="Churn vs. loyalty years", x="Loyalty years", y="Churn") +
  theme_minimal()

Correlation matrix

library(ggcorrplot)
# Correlation between variables
model.matrix(~0+., data=churn_data_a) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=2, 
             colors = c("#1B9E77", "white", "#D95F02")) 

Simple logistic regression model

simple_logistic <- glm(churn ~ loyalty_years, family = "binomial", data = train_a)
summary(simple_logistic)
## 
## Call:
## glm(formula = churn ~ loyalty_years, family = "binomial", data = train_a)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7885  -0.6489  -0.5297  -0.4298   2.2641  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.008785   0.090260 -11.176  < 2e-16 ***
## loyalty_years -0.073704   0.009049  -8.145  3.8e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2621.8  on 3010  degrees of freedom
## Residual deviance: 2552.2  on 3009  degrees of freedom
## AIC: 2556.2
## 
## Number of Fisher Scoring iterations: 4
test_predictions_sl <- predict(simple_logistic, newdata = test_a, type = "response")


predicted_classes_sl <- ifelse(test_predictions_sl > 0.5, 1, 0)

conf_sl <- confusionMatrix(as.factor(predicted_classes_sl), as.factor(test_a$churn))
print(conf_sl)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1709  280
##          1    0    0
##                                           
##                Accuracy : 0.8592          
##                  95% CI : (0.8432, 0.8742)
##     No Information Rate : 0.8592          
##     P-Value [Acc > NIR] : 0.5159          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8592          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8592          
##          Detection Rate : 0.8592          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
roc_sl <- roc(as.numeric(as.character(test_a$churn)), test_predictions_sl)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_sl)

auc_sl=roc_sl$auc
auc_sl
## Area under the curve: 0.6049
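
To see what the fitted coefficient implies across the whole loyalty range, here is a small sketch (grid_years is an illustrative helper built only for this plot):

# Sketch: predicted churn probability over the observed loyalty range
grid_years <- tibble(loyalty_years = 0:20)
grid_years$p_churn <- predict(simple_logistic, grid_years, type = "response")
ggplot(grid_years, aes(x = loyalty_years, y = p_churn)) +
  geom_line(color = "blue") +
  labs(x = "Loyalty years", y = "Predicted churn probability") +
  theme_minimal()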

Multiple logistic regression model

multiple_logistic <- glm(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, family = "binomial", data = train_a)
summary(multiple_logistic)
## 
## Call:
## glm(formula = churn ~ loyalty_years + premium_plan_ind + vehicles_covered + 
##     recent_rate_increase, family = "binomial", data = train_a)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1233  -0.6288  -0.5085  -0.3950   2.4893  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -0.605278   0.167308  -3.618 0.000297 ***
## loyalty_years        -0.075880   0.009149  -8.294  < 2e-16 ***
## premium_plan_ind     -0.423537   0.189851  -2.231 0.025688 *  
## vehicles_covered     -0.232327   0.069808  -3.328 0.000875 ***
## recent_rate_increase  0.709010   0.130197   5.446 5.16e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2621.8  on 3010  degrees of freedom
## Residual deviance: 2508.8  on 3006  degrees of freedom
## AIC: 2518.8
## 
## Number of Fisher Scoring iterations: 4
varImp(multiple_logistic) # which explanatory variable (X) is the most influential in predicting the target / dependent variable (Y)
##                       Overall
## loyalty_years        8.293842
## premium_plan_ind     2.230892
## vehicles_covered     3.328068
## recent_rate_increase 5.445659

As the customer’s loyalty moves from 1 to 10 years, how much does the probability of churning change?

years_prueba <- tibble(premium_plan_ind = 0, vehicles_covered = 1, recent_rate_increase=1, loyalty_years = c(1, 10))
predict(multiple_logistic, years_prueba, type = "response")
##         1         2 
## 0.4490587 0.2916477

The predicted probability of churn decreases from 44.9% in the first case to 29.2% in the second.

As the customer’s loyalty moves from 11 to 22 years, how much does the probability of churning change?

years_prueba_2 <- tibble(premium_plan_ind = 0, vehicles_covered = 1, recent_rate_increase=1, loyalty_years = c(11, 22))
predict(multiple_logistic, years_prueba_2, type = "response")
##         1         2 
## 0.2762229 0.1421001

The probability of churn decreases from 27.6% to 14.2%.

What is the impact of the customer’s loyalty on the probability of churning?

It is negatively related: the more loyalty years accumulate, the lower the probability of churn.
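
A compact way to quantify that relationship, as a sketch on the fitted model above: exponentiating the coefficients yields odds ratios, the multiplicative change in the odds of churn per unit increase of each predictor.

# Odds ratios: values below 1 decrease the odds of churn, above 1 increase them
exp(coef(multiple_logistic))
# e.g. exp(-0.0759) ≈ 0.93, so each extra loyalty year cuts churn odds by ~7%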

What about the probability of churning when considering a premium plan and the number of vehicles covered?

Like loyalty years, these two variables are statistically significant and have a strong negative impact on the probability of churn, as the model coefficients show. In both cases, the more vehicles covered by the policy, and if the customer holds a premium plan, the lower the probability that the customer churns.
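
As a sketch of the size of that effect, compare two otherwise identical hypothetical customers with and without the premium plan (plan_prueba is an illustrative name, mirroring years_prueba above):

# Hypothetical profiles: same customer with premium_plan_ind = 0 vs. 1
plan_prueba <- tibble(loyalty_years = 10, vehicles_covered = 2,
                      recent_rate_increase = 0, premium_plan_ind = c(0, 1))
predict(multiple_logistic, plan_prueba, type = "response")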

Diagnostics of the multiple logistic regression

Confusion matrix

test_predictions <- predict(multiple_logistic, newdata = test_a, type = "response")


predicted_classes <- ifelse(test_predictions > 0.5, 1, 0)

conf_ml <- confusionMatrix(as.factor(predicted_classes), as.factor(test_a$churn))
print(conf_ml)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1709  280
##          1    0    0
##                                           
##                Accuracy : 0.8592          
##                  95% CI : (0.8432, 0.8742)
##     No Information Rate : 0.8592          
##     P-Value [Acc > NIR] : 0.5159          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8592          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8592          
##          Detection Rate : 0.8592          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
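
The matrix above shows that, with churners at only ~14% of the test set, the default 0.5 cutoff never predicts churn. As a sketch (not part of the original analysis), lowering the cutoff toward the prevalence trades specificity for the ability to flag churners:

# Sketch: re-classify with a cutoff near the churn prevalence (~0.15)
predicted_classes_low <- ifelse(test_predictions > 0.15, 1, 0)
confusionMatrix(as.factor(predicted_classes_low), as.factor(test_a$churn),
                positive = "1")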

ROC curve

roc_obj <- roc(as.numeric(as.character(test_a$churn)), test_predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj)

AUC

# AUC - Multiple Logistic Regression Model 
auc_ml<- roc_obj$auc
auc_ml
## Area under the curve: 0.6223

Support vector machine (SVM)

svm_model_1 <- svm(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase,
                   data = train_b, type = "C-classification",
                   cost = 1, kernel = "linear", scale = FALSE)
svm_predictions <- predict(svm_model_1, newdata = test_b)

# Generate the confusion matrix
conf_svm <- confusionMatrix(svm_predictions, test_b$churn, positive = "Yes")

# Print the confusion matrix and its metrics
print(conf_svm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1709  280
##        Yes    0    0
##                                           
##                Accuracy : 0.8592          
##                  95% CI : (0.8432, 0.8742)
##     No Information Rate : 0.8592          
##     P-Value [Acc > NIR] : 0.5159          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8592          
##              Prevalence : 0.1408          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Yes             
## 
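
The SVM above was fit without probability estimates, which is why no AUC appears for it in the comparison below. A sketch of how one could obtain it, refitting with probability = TRUE (an e1071::svm option not used in the original run):

# Sketch: refit the SVM with probability estimates to compute an AUC
svm_model_prob <- svm(churn ~ loyalty_years + premium_plan_ind +
                        vehicles_covered + recent_rate_increase,
                      data = train_b, type = "C-classification",
                      cost = 1, kernel = "linear", scale = FALSE,
                      probability = TRUE)
svm_prob <- attr(predict(svm_model_prob, newdata = test_b, probability = TRUE),
                 "probabilities")[, "Yes"]
auc_svm  <- roc(test_b$churn, svm_prob)$auc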

Naive Bayes

naive_bayes_model   <- naiveBayes(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, data = train_a)
naive_bayes_predictions <- predict(naive_bayes_model, newdata = test_a)



conf_nb <- confusionMatrix(as.factor(naive_bayes_predictions), as.factor(test_a$churn))


print(conf_nb)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1635  245
##          1   74   35
##                                           
##                Accuracy : 0.8396          
##                  95% CI : (0.8227, 0.8555)
##     No Information Rate : 0.8592          
##     P-Value [Acc > NIR] : 0.9939          
##                                           
##                   Kappa : 0.1097          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9567          
##             Specificity : 0.1250          
##          Pos Pred Value : 0.8697          
##          Neg Pred Value : 0.3211          
##              Prevalence : 0.8592          
##          Detection Rate : 0.8220          
##    Detection Prevalence : 0.9452          
##       Balanced Accuracy : 0.5408          
##                                           
##        'Positive' Class : 0               
## 
naive_bayes_prob <- predict(naive_bayes_model, newdata = test_a, type = "raw")[,2]

# Create the ROC object
roc_nb <- roc(response = test_a$churn, predictor = naive_bayes_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_nb, main = "ROC curve for the Naive Bayes model", col = "#1c61b6")

auc_nb=roc_nb$auc
auc_nb
## Area under the curve: 0.6055
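
Since ROC objects now exist for the three probabilistic models, a quick sketch overlaying them makes the AUC comparison in the next section visual:

# Sketch: overlay the three ROC curves for a direct visual comparison
plot(roc_sl, col = "gray50")
plot(roc_obj, col = "#1c61b6", add = TRUE)
plot(roc_nb, col = "#D95F02", add = TRUE)
legend("bottomright",
       legend = c("Simple logistic", "Multiple logistic", "Naive Bayes"),
       col = c("gray50", "#1c61b6", "#D95F02"), lwd = 2)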

Evaluation metrics

#SL
sensitivity_sl <- conf_sl$byClass["Sensitivity"]
accuracy_sl <- conf_sl$overall["Accuracy"]
precision_sl <- conf_sl$byClass["Pos Pred Value"]  # byClass uses the short name; "Positive Predictive Value" returns NA
kappa_sl <- conf_sl$overall["Kappa"]
f1_score_sl <- 2 * (precision_sl * sensitivity_sl) / (precision_sl + sensitivity_sl)

auc_sl
## Area under the curve: 0.6049
#ML

sensitivity_ml <- conf_ml$byClass["Sensitivity"]
accuracy_ml <- conf_ml$overall["Accuracy"]
kappa_ml <- conf_ml$overall["Kappa"]
precision_ml <- conf_ml$byClass["Pos Pred Value"]
f1_score_ml <- 2 * (precision_ml * sensitivity_ml) / (precision_ml + sensitivity_ml)
auc_ml
## Area under the curve: 0.6223
#NB

sensitivity_nb <- conf_nb$byClass["Sensitivity"]
accuracy_nb <- conf_nb$overall["Accuracy"]
kappa_nb <- conf_nb$overall["Kappa"]
precision_nb <- conf_nb$byClass["Pos Pred Value"]
f1_score_nb <- 2 * (precision_nb * sensitivity_nb) / (precision_nb + sensitivity_nb)
auc_nb
## Area under the curve: 0.6055
#svm

sensitivity_svm <- conf_svm$byClass["Sensitivity"]
accuracy_svm <- conf_svm$overall["Accuracy"]
kappa_svm <- conf_svm$overall["Kappa"]
precision_svm <- conf_svm$byClass["Pos Pred Value"]
f1_score_svm <- 2 * (precision_svm * sensitivity_svm) / (precision_svm + sensitivity_svm)
# Collect the metrics of all models in a data.frame
metrics_table <- data.frame(
  Model = c("Simple Logistic", "Multiple Logistic", "Naive Bayes", "SVM"),
  Sensitivity = c(sensitivity_sl, sensitivity_ml, sensitivity_nb, sensitivity_svm),
  Accuracy = c(accuracy_sl, accuracy_ml, accuracy_nb, accuracy_svm),
  Kappa = c(kappa_sl, kappa_ml, kappa_nb, kappa_svm),
  Precision = c(precision_sl, precision_ml, precision_nb, precision_svm),
  F1_Score = c(f1_score_sl, f1_score_ml, f1_score_nb, f1_score_svm),
  AUC = c(auc_sl, auc_ml, auc_nb, NA) # replace NA with auc_svm if available
)

# Print the table
print(metrics_table)
##               Model Sensitivity  Accuracy     Kappa Precision  F1_Score
## 1   Simple Logistic   1.0000000 0.8592257 0.0000000 0.8592257 0.9242834
## 2 Multiple Logistic   1.0000000 0.8592257 0.0000000 0.8592257 0.9242834
## 3       Naive Bayes   0.9566998 0.8396179 0.1097125 0.8696809 0.9111173
## 4               SVM   0.0000000 0.8592257 0.0000000       NaN       NaN
##         AUC
## 1 0.6049183
## 2 0.6222739
## 3 0.6054972
## 4        NA

As the table shows, the multiple logistic model is the most appropriate classifier for these data: its AUC is the highest, and its sensitivity is also the highest, which matters because false negatives are especially costly in this case study. It is also much easier to interpret than the other models, and its accuracy ties with the simple logistic model for the highest overall. Note, however, that with the default 0.5 cutoff both logistic models classify every customer as a non-churner (the positive class in the matrices above is 0), so the threshold-free AUC is the fairest basis for this comparison.