library(foreign)
library(modelr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.5.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(broom)
##
## Attaching package: 'broom'
##
## The following object is masked from 'package:modelr':
##
## bootstrap
library(ISLR)
library(readr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(class)
library(ROCR)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(caTools)
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggpubr)
library(reshape)
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:class':
##
## condense
##
## The following object is masked from 'package:lubridate':
##
## stamp
##
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
##
## The following object is masked from 'package:dplyr':
##
## rename
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following object is masked from 'package:pROC':
##
## auc
##
## The following objects are masked from 'package:caret':
##
## precision, recall
##
## The following objects are masked from 'package:modelr':
##
## mae, mape, mse, rmse
library(mlbench)
library(rsample)
##
## Attaching package: 'rsample'
##
## The following object is masked from 'package:e1071':
##
## permutations
library(cluster)
## Warning: package 'cluster' was built under R version 4.2.3
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(modeldata)
## Warning: package 'modeldata' was built under R version 4.2.3
library(klaR)
## Warning: package 'klaR' was built under R version 4.2.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(naivebayes)
## naivebayes 0.9.7 loaded
<- read.csv("/Users/gabrielmedina/Downloads/Quiz_3/insurance_churn.csv")
churn_data_a <- read.csv("/Users/gabrielmedina/Downloads/Quiz_3/insurance_churn.csv") churn_data_b
### dataset characteristics
str(churn_data_a)
## 'data.frame': 5000 obs. of 9 variables:
## $ member_id : int 11718698 24449305 92334825 99286533 63192693 13252492 60108914 64755437 92095594 33580274 ...
## $ loyalty_years : int 1 18 13 4 3 5 18 10 2 3 ...
## $ vehicles_covered : int 1 3 3 1 1 2 3 1 2 4 ...
## $ premium_plan_ind : int 0 0 0 1 0 0 0 0 0 0 ...
## $ mobile_app_user : int 0 0 1 1 1 1 0 1 0 0 ...
## $ home_auto_bundle : int 1 0 1 1 0 0 0 0 0 0 ...
## $ auto_pay_ind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ recent_rate_increase: int 0 0 0 0 1 0 0 0 0 1 ...
## $ churn : int 0 0 0 1 0 0 0 0 0 0 ...
prop.table(table(churn_data_a$churn))
##
## 0 1
## 0.8492 0.1508
### transforming dataset's variables
<- subset(churn_data_a, select = -member_id)
churn_data_a <- subset(churn_data_b, select = -member_id) churn_data_b
$churn[churn_data_b$churn == 1] = "Yes"
churn_data_b$churn[churn_data_b$churn == 0] = "No" churn_data_b
$churn <- as.factor(churn_data_b$churn)
churn_data_b$premium_plan_ind_a <- as.factor(churn_data_b$premium_plan_ind)
churn_data_bsummary(churn_data_b)
## loyalty_years vehicles_covered premium_plan_ind mobile_app_user
## Min. : 0.000 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 5.000 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :10.000 Median :2.000 Median :0.0000 Median :1.0000
## Mean : 9.927 Mean :2.029 Mean :0.1008 Mean :0.5366
## 3rd Qu.:15.000 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :20.000 Max. :4.000 Max. :1.0000 Max. :1.0000
## home_auto_bundle auto_pay_ind recent_rate_increase churn
## Min. :0.0000 Min. :0.0000 Min. :0.0000 No :4246
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 Yes: 754
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2758 Mean :0.1796 Mean :0.1402
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## premium_plan_ind_a
## 0:4496
## 1: 504
##
##
##
##
### cross_data validation --> quantitative dataset
set.seed(123)
<- sample(c(TRUE, FALSE), nrow(churn_data_a), replace = T, prob = c(0.6,0.4))
sample_a <- churn_data_a[sample_a, ]
train_a <- churn_data_a[!sample_a, ] test_a
### cross_data validation --> quantitative & qualitative dataset
set.seed(123)
<- sample(c(TRUE, FALSE), nrow(churn_data_b), replace = T, prob = c(0.6,0.4))
sample_b <- churn_data_b[sample_b, ]
train_b <- churn_data_b[!sample_b, ] test_b
describe(churn_data_a)
## vars n mean sd median trimmed mad min max range
## loyalty_years 1 5000 9.93 5.80 10 9.92 7.41 0 20 20
## vehicles_covered 2 5000 2.03 0.74 2 2.01 0.00 1 4 3
## premium_plan_ind 3 5000 0.10 0.30 0 0.00 0.00 0 1 1
## mobile_app_user 4 5000 0.54 0.50 1 0.55 0.00 0 1 1
## home_auto_bundle 5 5000 0.28 0.45 0 0.22 0.00 0 1 1
## auto_pay_ind 6 5000 0.18 0.38 0 0.10 0.00 0 1 1
## recent_rate_increase 7 5000 0.14 0.35 0 0.05 0.00 0 1 1
## churn 8 5000 0.15 0.36 0 0.06 0.00 0 1 1
## skew kurtosis se
## loyalty_years 0.01 -1.16 0.08
## vehicles_covered 0.22 -0.48 0.01
## premium_plan_ind 2.65 5.03 0.00
## mobile_app_user -0.15 -1.98 0.01
## home_auto_bundle 1.00 -0.99 0.01
## auto_pay_ind 1.67 0.79 0.01
## recent_rate_increase 2.07 2.29 0.00
## churn 1.95 1.81 0.01
<- function(x) {
moda <- table(x)
tabla_frecuencias <- names(tabla_frecuencias[tabla_frecuencias == max(tabla_frecuencias)])
modas if (is.numeric(x)) {
<- as.numeric(modas)
modas
}return(modas)
}# Usar sapply para aplicar la función moda a cada columna
<- sapply(churn_data_a, moda)
modas_por_columna
# Mostrar las modas de cada columna
modas_por_columna
## loyalty_years vehicles_covered premium_plan_ind
## 8 2 0
## mobile_app_user home_auto_bundle auto_pay_ind
## 1 0 0
## recent_rate_increase churn
## 0 0
hist(churn_data_a$loyalty_years,
main = "Histograma de Años",
xlab = "Años",
ylab = "Frecuencia",
col = "blue",
border = "black")
hist(churn_data_a$churn,
main = "Histograma de churn",
xlab = "churn",
ylab = "Frecuencia",
col = "red",
border = "black")
hist(churn_data_a$vehicles_covered,
main = "Histograma de vehículos cubiertos",
xlab = "Vehículos",
ylab = "Frecuencia",
col = "lightblue",
border = "black")
ggplot(churn_data_a, aes(x=factor(loyalty_years), y=churn)) +
geom_point(position = position_jitter(width = 0.2), aes(color=factor(loyalty_years)), alpha=0.5) +
labs(title="Gastos vs. Número de Hijos", x="Número de Hijos", y="Gastos Médicos") +
theme_minimal()
library(ggcorrplot)
# Correlación entre variables
# Correlation
model.matrix(~0+., data=churn_data_a) %>%
cor(use="pairwise.complete.obs") %>%
ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=2,
colors = c("#1B9E77", "white", "#D95F02"))
<- glm(churn ~ loyalty_years, family = "binomial", data = train_a)
simple_logistic summary(simple_logistic)
##
## Call:
## glm(formula = churn ~ loyalty_years, family = "binomial", data = train_a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7885 -0.6489 -0.5297 -0.4298 2.2641
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.008785 0.090260 -11.176 < 2e-16 ***
## loyalty_years -0.073704 0.009049 -8.145 3.8e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2621.8 on 3010 degrees of freedom
## Residual deviance: 2552.2 on 3009 degrees of freedom
## AIC: 2556.2
##
## Number of Fisher Scoring iterations: 4
<- predict(simple_logistic, newdata = test_a, type = "response")
test_predictions_sl
<- ifelse(test_predictions_sl > 0.5, 1, 0)
predicted_classes_sl
<- confusionMatrix(as.factor(predicted_classes_sl), as.factor(test_a$churn))
conf_sl print(conf_sl)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1709 280
## 1 0 0
##
## Accuracy : 0.8592
## 95% CI : (0.8432, 0.8742)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.5159
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8592
## Neg Pred Value : NaN
## Prevalence : 0.8592
## Detection Rate : 0.8592
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
<- roc(as.numeric(as.character(test_a$churn)), test_predictions_sl) roc_sl
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_sl)
=roc_sl$auc
auc_sl auc_sl
## Area under the curve: 0.6049
<- glm(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, family = "binomial", data = train_a)
multiple_logistic summary(multiple_logistic)
##
## Call:
## glm(formula = churn ~ loyalty_years + premium_plan_ind + vehicles_covered +
## recent_rate_increase, family = "binomial", data = train_a)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1233 -0.6288 -0.5085 -0.3950 2.4893
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.605278 0.167308 -3.618 0.000297 ***
## loyalty_years -0.075880 0.009149 -8.294 < 2e-16 ***
## premium_plan_ind -0.423537 0.189851 -2.231 0.025688 *
## vehicles_covered -0.232327 0.069808 -3.328 0.000875 ***
## recent_rate_increase 0.709010 0.130197 5.446 5.16e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2621.8 on 3010 degrees of freedom
## Residual deviance: 2508.8 on 3006 degrees of freedom
## AIC: 2518.8
##
## Number of Fisher Scoring iterations: 4
varImp(multiple_logistic) # which explanatory variable (X) is the most influential in predicting the target / dependent variable (Y)
## Overall
## loyalty_years 8.293842
## premium_plan_ind 2.230892
## vehicles_covered 3.328068
## recent_rate_increase 5.445659
<- tibble(premium_plan_ind = 0, vehicles_covered = 1, recent_rate_increase=1, loyalty_years = c(1, 10))
years_prueba predict(multiple_logistic, years_prueba, type = "response")
## 1 2
## 0.4490587 0.2916477
La probabilidad en el segundo caso disminuye del 44% al 29$ de churn.
<- tibble(premium_plan_ind = 0, vehicles_covered = 1, recent_rate_increase=1, loyalty_years = c(11, 22))
years_prueba_2 predict(multiple_logistic, years_prueba_2, type = "response")
## 1 2
## 0.2762229 0.1421001
La probabilidad de churn disminuye del 27.6% al 14.2%
Esta negativamente relacionada, es decir, mientras más aumente los años de lealtad menos será la probabilidad de churn.
Matriz de confusión
<- predict(multiple_logistic, newdata = test_a, type = "response")
test_predictions
<- ifelse(test_predictions > 0.5, 1, 0)
predicted_classes
<- confusionMatrix(as.factor(predicted_classes), as.factor(test_a$churn))
conf_ml print(conf_ml)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1709 280
## 1 0 0
##
## Accuracy : 0.8592
## 95% CI : (0.8432, 0.8742)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.5159
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8592
## Neg Pred Value : NaN
## Prevalence : 0.8592
## Detection Rate : 0.8592
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
ROE CURVE
<- roc(as.numeric(as.character(test_a$churn)), test_predictions) roc_obj
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj)
AUC
# AUC - Multiple Logistic Regression Model
<- roc_obj$auc
auc_ml auc_ml
## Area under the curve: 0.6223
<- svm(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, data = train_b,type = "C-classification",cost = 1, kernel = "linear",scale = FALSE) svm_model_1
<- predict(svm_model_1, newdata = test_b)
svm_predictions
# Generar y mostrar la matriz de confusión
<- confusionMatrix(svm_predictions, test_b$churn, positive = "Yes")
conf_svm
# Imprimir la matriz de confusión y las métricas
print(conf_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1709 280
## Yes 0 0
##
## Accuracy : 0.8592
## 95% CI : (0.8432, 0.8742)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.5159
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8592
## Prevalence : 0.1408
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
<- naiveBayes(churn ~ loyalty_years + premium_plan_ind + vehicles_covered + recent_rate_increase, data = train_a) naive_bayes_model
<- predict(naive_bayes_model, newdata = test_a)
naive_bayes_predictions
<- confusionMatrix(as.factor(naive_bayes_predictions), as.factor(test_a$churn))
conf_nb
print(conf_nb)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1635 245
## 1 74 35
##
## Accuracy : 0.8396
## 95% CI : (0.8227, 0.8555)
## No Information Rate : 0.8592
## P-Value [Acc > NIR] : 0.9939
##
## Kappa : 0.1097
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9567
## Specificity : 0.1250
## Pos Pred Value : 0.8697
## Neg Pred Value : 0.3211
## Prevalence : 0.8592
## Detection Rate : 0.8220
## Detection Prevalence : 0.9452
## Balanced Accuracy : 0.5408
##
## 'Positive' Class : 0
##
<- predict(naive_bayes_model, newdata = test_a, type = "raw")[,2]
naive_bayes_prob
# Crear el objeto ROC
<- roc(response = test_a$churn, predictor = naive_bayes_prob) roc_nb
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Trazar la curva ROC
plot(roc_nb, main="Curva ROC para el Modelo de Naive Bayes", col="#1c61b6")
=roc_nb$auc
auc_nb auc_nb
## Area under the curve: 0.6055
#SL
<- conf_sl$byClass["Sensitivity"]
sensitivity_sl <- conf_sl$overall["Accuracy"]
accuracy_sl <- conf_sl$byClass["Positive Predictive Value"]
precision_sl <- conf_sl$overall["Kappa"]
kappa_sl <- 2 * (precision_sl * sensitivity_sl) / (precision_sl + sensitivity_sl)
f1_score_sl
auc_sl
## Area under the curve: 0.6049
#ML
<- conf_ml$byClass["Sensitivity"]
sensitivity_ml <- conf_ml$overall["Accuracy"]
accuracy_ml <- conf_ml$overall["Kappa"]
kappa_ml <- conf_ml$byClass["Positive Predictive Value"]
precision_ml <- 2 * (precision_ml * sensitivity_ml) / (precision_ml + sensitivity_ml)
f1_score_ml auc_ml
## Area under the curve: 0.6223
#NB
<- conf_nb$byClass["Sensitivity"]
sensitivity_nb <- conf_nb$overall["Accuracy"]
accuracy_nb <- conf_nb$overall["Kappa"]
kappa_nb <- conf_nb$byClass["Positive Predictive Value"]
precision_nb <- 2 * (precision_nb * sensitivity_nb) / (precision_nb + sensitivity_nb)
f1_score_nb auc_nb
## Area under the curve: 0.6055
#svm
<- conf_svm$byClass["Sensitivity"]
sensitivity_svm <- conf_svm$overall["Accuracy"]
accuracy_svm <- conf_svm$overall["Kappa"]
kappa_svm <- conf_svm$byClass["Positive Predictive Value"]
precision_svm <- 2 * (precision_svm * sensitivity_svm) / (precision_svm + sensitivity_svm) f1_score_svm
# Crear un data.frame para organizar las métricas de los modelos
<- data.frame(
metrics_table Model = c("Simple Logistic", "Multiple Logistic", "Naive Bayes", "SVM"),
Sensitivity = c(sensitivity_sl, sensitivity_ml, sensitivity_nb, sensitivity_svm),
Accuracy = c(accuracy_sl, accuracy_ml, accuracy_nb, accuracy_svm),
Kappa = c(kappa_sl, kappa_ml, kappa_nb, kappa_svm),
Precision = c(precision_sl, precision_ml, precision_nb, precision_svm),
F1_Score = c(f1_score_sl, f1_score_ml, f1_score_nb, f1_score_svm),
AUC = c(auc_sl, auc_ml, auc_nb, NA) # Reemplaza NA con auc_svm si está disponible
)
# Imprimir la tabla
print(metrics_table)
## Model Sensitivity Accuracy Kappa Precision F1_Score
## 1 Simple Logistic 1.0000000 0.8592257 0.0000000 NA NA
## 2 Multiple Logistic 1.0000000 0.8592257 0.0000000 NA NA
## 3 Naive Bayes 0.9566998 0.8396179 0.1097125 NA NA
## 4 SVM 0.0000000 0.8592257 0.0000000 NA NA
## AUC
## 1 0.6049183
## 2 0.6222739
## 3 0.6054972
## 4 NA
Como podemos apreciar el modelo de Multiple logistic es el modelo más apropiado para clasificar nuestros datos, ya que el AUC es mayor y la sensibilidad es la más alta, lo cual es muy importante ya que los falsos negativos son sumamente críticos en este caso de estudio, además, este modelo es mucho más fácil de interpretar que los demás modelos. Por otro lado, el Accuracy de Multiple Logistic es el más alto de todos los modelos junto con Simple Logistic.