library(foreign)
library(modelr)
library(dplyr)## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) ## Warning: package 'ggplot2' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.5.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)##
## Attaching package: 'broom'
##
## The following object is masked from 'package:modelr':
##
## bootstrap
library(ISLR)
library(readr)
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(class)
library(ROCR)
library(pROC)## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(lmtest)## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(caTools)
library(rpart)## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)## Warning: package 'rpart.plot' was built under R version 4.2.3
library(psych) ##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggpubr)
library(reshape)##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:class':
##
## condense
##
## The following object is masked from 'package:lubridate':
##
## stamp
##
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
##
## The following object is masked from 'package:dplyr':
##
## rename
library(Metrics)##
## Attaching package: 'Metrics'
##
## The following object is masked from 'package:pROC':
##
## auc
##
## The following objects are masked from 'package:caret':
##
## precision, recall
##
## The following objects are masked from 'package:modelr':
##
## mae, mape, mse, rmse
library(mlbench)
library(rsample)##
## Attaching package: 'rsample'
##
## The following object is masked from 'package:e1071':
##
## permutations
library(cluster) ## Warning: package 'cluster' was built under R version 4.2.3
library(factoextra) ## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra) ##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(modeldata)## Warning: package 'modeldata' was built under R version 4.2.3
library(klaR)## Warning: package 'klaR' was built under R version 4.2.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(naivebayes)## naivebayes 0.9.7 loaded
churn_data_a <- read.csv("/Users/gabrielmedina/Downloads/Quiz_3/insurance_churn.csv")
churn_data_b <- read.csv("/Users/gabrielmedina/Downloads/Quiz_3/insurance_churn.csv")### dataset characteristics
str(churn_data_a)## 'data.frame': 5000 obs. of 9 variables:
## $ member_id : int 11718698 24449305 92334825 99286533 63192693 13252492 60108914 64755437 92095594 33580274 ...
## $ loyalty_years : int 1 18 13 4 3 5 18 10 2 3 ...
## $ vehicles_covered : int 1 3 3 1 1 2 3 1 2 4 ...
## $ premium_plan_ind : int 0 0 0 1 0 0 0 0 0 0 ...
## $ mobile_app_user : int 0 0 1 1 1 1 0 1 0 0 ...
## $ home_auto_bundle : int 1 0 1 1 0 0 0 0 0 0 ...
## $ auto_pay_ind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ recent_rate_increase: int 0 0 0 0 1 0 0 0 0 1 ...
## $ churn : int 0 0 0 1 0 0 0 0 0 0 ...
prop.table(table(churn_data_a$churn))##
## 0 1
## 0.8492 0.1508
### transforming dataset's variables
churn_data_a <- subset(churn_data_a, select = -member_id)
churn_data_b <- subset(churn_data_b, select = -member_id)churn_data_b$churn[churn_data_b$churn == 1] = "Yes"
churn_data_b$churn[churn_data_b$churn == 0] = "No"churn_data_b$churn <- as.factor(churn_data_b$churn)
churn_data_b$premium_plan_ind_a <- as.factor(churn_data_b$premium_plan_ind)
summary(churn_data_b)## loyalty_years vehicles_covered premium_plan_ind mobile_app_user
## Min. : 0.000 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 5.000 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :10.000 Median :2.000 Median :0.0000 Median :1.0000
## Mean : 9.927 Mean :2.029 Mean :0.1008 Mean :0.5366
## 3rd Qu.:15.000 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :20.000 Max. :4.000 Max. :1.0000 Max. :1.0000
## home_auto_bundle auto_pay_ind recent_rate_increase churn
## Min. :0.0000 Min. :0.0000 Min. :0.0000 No :4246
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 Yes: 754
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2758 Mean :0.1796 Mean :0.1402
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## premium_plan_ind_a
## 0:4496
## 1: 504
##
##
##
##
### cross_data validation --> quantitative dataset
set.seed(123)
sample_a <- sample(c(TRUE, FALSE), nrow(churn_data_a), replace = T, prob = c(0.6,0.4))
train_a <- churn_data_a[sample_a, ]
test_a <- churn_data_a[!sample_a, ]### cross_data validation --> quantitative & qualitative dataset
set.seed(123)
sample_b <- sample(c(TRUE, FALSE), nrow(churn_data_b), replace = T, prob = c(0.6,0.4))
train_b <- churn_data_b[sample_b, ]
test_b <- churn_data_b[!sample_b, ]describe(churn_data_a)## vars n mean sd median trimmed mad min max range
## loyalty_years 1 5000 9.93 5.80 10 9.92 7.41 0 20 20
## vehicles_covered 2 5000 2.03 0.74 2 2.01 0.00 1 4 3
## premium_plan_ind 3 5000 0.10 0.30 0 0.00 0.00 0 1 1
## mobile_app_user 4 5000 0.54 0.50 1 0.55 0.00 0 1 1
## home_auto_bundle 5 5000 0.28 0.45 0 0.22 0.00 0 1 1
## auto_pay_ind 6 5000 0.18 0.38 0 0.10 0.00 0 1 1
## recent_rate_increase 7 5000 0.14 0.35 0 0.05 0.00 0 1 1
## churn 8 5000 0.15 0.36 0 0.06 0.00 0 1 1
## skew kurtosis se
## loyalty_years 0.01 -1.16 0.08
## vehicles_covered 0.22 -0.48 0.01
## premium_plan_ind 2.65 5.03 0.00
## mobile_app_user -0.15 -1.98 0.01
## home_auto_bundle 1.00 -0.99 0.01
## auto_pay_ind 1.67 0.79 0.01
## recent_rate_increase 2.07 2.29 0.00
## churn 1.95 1.81 0.01
moda <- function(x) {
tabla_frecuencias <- table(x)
modas <- names(tabla_frecuencias[tabla_frecuencias == max(tabla_frecuencias)])
if (is.numeric(x)) {
modas <- as.numeric(modas)
}
return(modas)
}
# Usar sapply para aplicar la función moda a cada columna
modas_por_columna <- sapply(churn_data_a, moda)
# Mostrar las modas de cada columna
modas_por_columna## loyalty_years vehicles_covered premium_plan_ind
## 8 2 0
## mobile_app_user home_auto_bundle auto_pay_ind
## 1 0 0
## recent_rate_increase churn
## 0 0
hist(churn_data_a$loyalty_years,
main = "Histograma de Años",
xlab = "Años",
ylab = "Frecuencia",
col = "blue",
border = "black")hist(churn_data_a$churn,
main = "Histograma de churn",
xlab = "churn",
ylab = "Frecuencia",
col = "red",
border = "black")hist(churn_data_a$vehicles_covered,
main = "Histograma de vehículos cubiertos",
xlab = "Vehículos",
ylab = "Frecuencia",
col = "lightblue",
border = "black")ggplot(churn_data_a, aes(x=factor(loyalty_years), y=churn)) +
geom_point(position = position_jitter(width = 0.2), aes(color=factor(loyalty_years)), alpha=0.5) +
labs(title="Gastos vs. Número de Hijos", x="Número de Hijos", y="Gastos Médicos") +
theme_minimal()library(ggcorrplot)
# Correlación entre variables
# Correlation
model.matrix(~0+., data=churn_data_a) %>%
cor(use="pairwise.complete.obs") %>%
ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=2,
colors = c("#1B9E77", "white", "#D95F02")) simple_logistic <- glm(churn ~ loyalty_years, family = "binomial", data = train_a)
summary(simple_logistic)##
## Call:
## glm(formula = churn ~ loyalty_years, family = "binomial", data = train_a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7885 -0.6489 -0.5297 -0.4298 2.2641
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.008785 0.090260 -11.176 < 2e-16 ***
## loyalty_years -0.073704 0.009049 -8.145 3.8e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2621.8 on 3010 degrees of freedom
## Residual deviance: 2552.2 on 3009 degrees of freedom
## AIC: 2556.2
##
## Number of Fisher Scoring iterations: 4
test_predictions_sl <- predict(simple_logistic, newdata = test_a, type = "response")
predicted_classes_sl <- ifelse(test_predictions_sl > 0.5, 1, 0)
conf_sl <- confusionMatrix(as.factor(predicted_classes_sl), as.factor(test_a$churn))
print(conf_sl)## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1709 280
## 1 0 0
##
## Accuracy : 0.8592
## 95% CI : (0.8432, 0.8742)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.5159
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8592
## Neg Pred Value : NaN
## Prevalence : 0.8592
## Detection Rate : 0.8592
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
roc_sl <- roc(as.numeric(as.character(test_a$churn)), test_predictions_sl)## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_sl)auc_sl=roc_sl$auc
auc_sl## Area under the curve: 0.6049
multiple_logistic <- glm(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, family = "binomial", data = train_a)
summary(multiple_logistic)##
## Call:
## glm(formula = churn ~ loyalty_years + premium_plan_ind + vehicles_covered +
## recent_rate_increase, family = "binomial", data = train_a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1233 -0.6288 -0.5085 -0.3950 2.4893
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.605278 0.167308 -3.618 0.000297 ***
## loyalty_years -0.075880 0.009149 -8.294 < 2e-16 ***
## premium_plan_ind -0.423537 0.189851 -2.231 0.025688 *
## vehicles_covered -0.232327 0.069808 -3.328 0.000875 ***
## recent_rate_increase 0.709010 0.130197 5.446 5.16e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2621.8 on 3010 degrees of freedom
## Residual deviance: 2508.8 on 3006 degrees of freedom
## AIC: 2518.8
##
## Number of Fisher Scoring iterations: 4
varImp(multiple_logistic) # which explanatory variable (X) is the most influential in predicting the target / dependent variable (Y)## Overall
## loyalty_years 8.293842
## premium_plan_ind 2.230892
## vehicles_covered 3.328068
## recent_rate_increase 5.445659
years_prueba <- tibble(premium_plan_ind = 0, vehicles_covered = 1, recent_rate_increase=1, loyalty_years = c(1, 10))
predict(multiple_logistic, years_prueba, type = "response")## 1 2
## 0.4490587 0.2916477
La probabilidad en el segundo caso disminuye del 44% al 29$ de churn.
years_prueba_2 <- tibble(premium_plan_ind = 0, vehicles_covered = 1, recent_rate_increase=1, loyalty_years = c(11, 22))
predict(multiple_logistic, years_prueba_2, type = "response")## 1 2
## 0.2762229 0.1421001
La probabilidad de churn disminuye del 27.6% al 14.2%
Esta negativamente relacionada, es decir, mientras más aumente los años de lealtad menos será la probabilidad de churn.
Matriz de confusión
test_predictions <- predict(multiple_logistic, newdata = test_a, type = "response")
predicted_classes <- ifelse(test_predictions > 0.5, 1, 0)
conf_ml <- confusionMatrix(as.factor(predicted_classes), as.factor(test_a$churn))
print(conf_ml)## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1709 280
## 1 0 0
##
## Accuracy : 0.8592
## 95% CI : (0.8432, 0.8742)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.5159
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8592
## Neg Pred Value : NaN
## Prevalence : 0.8592
## Detection Rate : 0.8592
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
ROE CURVE
roc_obj <- roc(as.numeric(as.character(test_a$churn)), test_predictions)## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj)AUC
# AUC - Multiple Logistic Regression Model
auc_ml<- roc_obj$auc
auc_ml## Area under the curve: 0.6223
svm_model_1 <- svm(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, data = train_b,type = "C-classification",cost = 1, kernel = "linear",scale = FALSE)svm_predictions <- predict(svm_model_1, newdata = test_b)
# Generar y mostrar la matriz de confusión
conf_svm <- confusionMatrix(svm_predictions, test_b$churn, positive = "Yes")
# Imprimir la matriz de confusión y las métricas
print(conf_svm)## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1709 280
## Yes 0 0
##
## Accuracy : 0.8592
## 95% CI : (0.8432, 0.8742)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.5159
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8592
## Prevalence : 0.1408
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
naive_bayes_model <- naiveBayes(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, data = train_a)naive_bayes_predictions <- predict(naive_bayes_model, newdata = test_a)
conf_nb <- confusionMatrix(as.factor(naive_bayes_predictions), as.factor(test_a$churn))
print(conf_nb)## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1635 245
## 1 74 35
##
## Accuracy : 0.8396
## 95% CI : (0.8227, 0.8555)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.9939
##
## Kappa : 0.1097
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9567
## Specificity : 0.1250
## Pos Pred Value : 0.8697
## Neg Pred Value : 0.3211
## Prevalence : 0.8592
## Detection Rate : 0.8220
## Detection Prevalence : 0.9452
## Balanced Accuracy : 0.5408
##
## 'Positive' Class : 0
##
naive_bayes_prob <- predict(naive_bayes_model, newdata = test_a, type = "raw")[,2]
# Crear el objeto ROC
roc_nb <- roc(response = test_a$churn, predictor = naive_bayes_prob)## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Trazar la curva ROC
plot(roc_nb, main="Curva ROC para el Modelo de Naive Bayes", col="#1c61b6")auc_nb=roc_nb$auc
auc_nb## Area under the curve: 0.6055
#SL
sensitivity_sl <- conf_sl$byClass["Sensitivity"]
accuracy_sl <- conf_sl$overall["Accuracy"]
precision_sl <- conf_sl$byClass["Positive Predictive Value"]
kappa_sl <- conf_sl$overall["Kappa"]
f1_score_sl <- 2 * (precision_sl * sensitivity_sl) / (precision_sl + sensitivity_sl)
auc_sl## Area under the curve: 0.6049
#ML
sensitivity_ml <- conf_ml$byClass["Sensitivity"]
accuracy_ml <- conf_ml$overall["Accuracy"]
kappa_ml <- conf_ml$overall["Kappa"]
precision_ml <- conf_ml$byClass["Positive Predictive Value"]
f1_score_ml <- 2 * (precision_ml * sensitivity_ml) / (precision_ml + sensitivity_ml)
auc_ml## Area under the curve: 0.6223
#NB
sensitivity_nb <- conf_nb$byClass["Sensitivity"]
accuracy_nb <- conf_nb$overall["Accuracy"]
kappa_nb <- conf_nb$overall["Kappa"]
precision_nb <- conf_nb$byClass["Positive Predictive Value"]
f1_score_nb <- 2 * (precision_nb * sensitivity_nb) / (precision_nb + sensitivity_nb)
auc_nb## Area under the curve: 0.6055
#svm
sensitivity_svm <- conf_svm$byClass["Sensitivity"]
accuracy_svm <- conf_svm$overall["Accuracy"]
kappa_svm <- conf_svm$overall["Kappa"]
precision_svm <- conf_svm$byClass["Positive Predictive Value"]
f1_score_svm <- 2 * (precision_svm * sensitivity_svm) / (precision_svm + sensitivity_svm)# Crear un data.frame para organizar las métricas de los modelos
metrics_table <- data.frame(
Model = c("Simple Logistic", "Multiple Logistic", "Naive Bayes", "SVM"),
Sensitivity = c(sensitivity_sl, sensitivity_ml, sensitivity_nb, sensitivity_svm),
Accuracy = c(accuracy_sl, accuracy_ml, accuracy_nb, accuracy_svm),
Kappa = c(kappa_sl, kappa_ml, kappa_nb, kappa_svm),
Precision = c(precision_sl, precision_ml, precision_nb, precision_svm),
F1_Score = c(f1_score_sl, f1_score_ml, f1_score_nb, f1_score_svm),
AUC = c(auc_sl, auc_ml, auc_nb, NA) # Reemplaza NA con auc_svm si está disponible
)
# Imprimir la tabla
print(metrics_table)## Model Sensitivity Accuracy Kappa Precision F1_Score
## 1 Simple Logistic 1.0000000 0.8592257 0.0000000 NA NA
## 2 Multiple Logistic 1.0000000 0.8592257 0.0000000 NA NA
## 3 Naive Bayes 0.9566998 0.8396179 0.1097125 NA NA
## 4 SVM 0.0000000 0.8592257 0.0000000 NA NA
## AUC
## 1 0.6049183
## 2 0.6222739
## 3 0.6054972
## 4 NA
Como podemos apreciar el modelo de Multiple logistic es el modelo más apropiado para clasificar nuestros datos, ya que el AUC es mayor y la sensibilidad es la más alta, lo cual es muy importante ya que los falsos negativos son sumamente críticos en este caso de estudio, además, este modelo es mucho más fácil de interpretar que los demás modelos. Por otro lado, el Accuracy de Multiple Logistic es el más alto de todos los modelos junto con Simple Logistic.