# =============================================
# 1. Cargar y preparar los datos
# =============================================
# Cargar base de datos
data1 <- read.csv("bank.csv")
# Verificar estructura
str(data1)## 'data.frame': 41188 obs. of 20 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr "housemaid" "services" "services" "admin." ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education : chr "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr "no" "unknown" "no" "no" ...
## $ housing : chr "no" "no" "yes" "no" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "no" "no" "no" "no" ...
#=============================================================
# PREPARACIÓN DE VARIABLES
#=============================================================
# Variable respuesta:
# no = No suscribió depósito
# yes = Sí suscribió depósito
data1$y <- factor(data1$y, levels = c("no", "yes"), labels = c("No", "Sí"))
#=============================================================
# VARIABLES CATEGÓRICAS
#=============================================================
# Variables categóricas como factores
data1$job <- as.factor(data1$job)
data1$marital <- as.factor(data1$marital)
data1$education <- as.factor(data1$education)
data1$contact <- as.factor(data1$contact)
data1$poutcome <- as.factor(data1$poutcome)
#=============================================================
# VERIFICAR ESTRUCTURA
#=============================================================
str(data1)## 'data.frame': 41188 obs. of 20 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
## $ default : chr "no" "unknown" "no" "no" ...
## $ housing : chr "no" "no" "yes" "no" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
#=============================================================
# VISUALIZACIÓN DE LOS DATOS
#=============================================================
#-------------------------------------------------------------
# Gráfico 1: Edad vs Euribor según suscripción
#-------------------------------------------------------------
ggplot(data1,
aes(x = age,
y = euribor3m,
color = y)) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c("red", "darkgreen"),
labels = c("No suscribió",
"Sí suscribió"),
name = "Suscripción") +
labs(title = "Edad y tasa Euribor según suscripción",
x = "Edad",
y = "Euribor 3 meses") +
theme_minimal(base_size = 14)#-------------------------------------------------------------
# Gráfico 2: Número de contactos vs contactos previos
#-------------------------------------------------------------
ggplot(data1,
aes(x = campaign,
y = previous,
color = y)) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c("red", "darkgreen"),
labels = c("No suscribió",
"Sí suscribió"),
name = "Suscripción") +
labs(title = "Campaña actual vs contactos previos",
x = "Número de contactos actuales",
y = "Contactos previos") +
theme_minimal(base_size = 14)#=============================================================
# Calcular odds de suscripción
#=============================================================
# Tabla de proporciones
prop1 <- prop.table(table(data1$y))
prop1##
## No Sí
## 0.8873458 0.1126542
## Sí
## 0.1269563
La regresión logística es un modelo estadístico utilizado para problemas de clasificación binaria, ya que permite predecir la probabilidad de ocurrencia de un evento, como la respuesta “sí” o “no” en la suscripción de depósitos a plazo. Una de sus principales ventajas es su alta interpretabilidad mediante los Odds Ratios, los cuales facilitan comprender el efecto de cada variable sobre la probabilidad de suscripción. Además, este modelo permite identificar cuáles variables son estadísticamente significativas y relevantes en la toma de decisiones.
modelo_logit <- glm(y ~ age + job + marital + education + campaign + previous + euribor3m + emp.var.rate + cons.conf.idx +
contact + poutcome, data = data1, family = binomial)##
## Call:
## glm(formula = y ~ age + job + marital + education + campaign +
## previous + euribor3m + emp.var.rate + cons.conf.idx + contact +
## poutcome, family = binomial, data = data1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.953600 0.301615 3.162 0.00157 **
## age 0.002091 0.002067 1.011 0.31181
## jobblue-collar -0.298109 0.067228 -4.434 9.24e-06 ***
## jobentrepreneur -0.138288 0.105823 -1.307 0.19129
## jobhousemaid -0.074137 0.124635 -0.595 0.55196
## jobmanagement -0.091699 0.073249 -1.252 0.21062
## jobretired 0.360589 0.091318 3.949 7.86e-05 ***
## jobself-employed -0.071970 0.099047 -0.727 0.46746
## jobservices -0.211645 0.073499 -2.880 0.00398 **
## jobstudent 0.331437 0.097435 3.402 0.00067 ***
## jobtechnician -0.059750 0.060277 -0.991 0.32156
## jobunemployed 0.040707 0.108305 0.376 0.70702
## jobunknown -0.152713 0.204112 -0.748 0.45435
## maritalmarried 0.023783 0.058584 0.406 0.68477
## maritalsingle 0.133575 0.066441 2.010 0.04439 *
## maritalunknown 0.236734 0.356850 0.663 0.50707
## educationbasic.6y 0.050527 0.101821 0.496 0.61973
## educationbasic.9y -0.066986 0.080549 -0.832 0.40563
## educationhigh.school 0.010892 0.078026 0.140 0.88898
## educationilliterate 0.821833 0.667323 1.232 0.21812
## educationprofessional.course 0.076475 0.086009 0.889 0.37392
## educationuniversity.degree 0.143134 0.077861 1.838 0.06601 .
## educationunknown 0.183701 0.102515 1.792 0.07314 .
## campaign -0.053943 0.009444 -5.712 1.12e-08 ***
## previous 0.245251 0.052435 4.677 2.91e-06 ***
## euribor3m -0.584539 0.038520 -15.175 < 2e-16 ***
## emp.var.rate 0.161650 0.041020 3.941 8.12e-05 ***
## cons.conf.idx 0.046058 0.003571 12.897 < 2e-16 ***
## contacttelephone -0.430288 0.047101 -9.135 < 2e-16 ***
## poutcomenonexistent 0.741229 0.084133 8.810 < 2e-16 ***
## poutcomesuccess 1.909005 0.075996 25.120 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 23672 on 41157 degrees of freedom
## AIC: 23734
##
## Number of Fisher Scoring iterations: 6
#=============================================================
# ODDS RATIOS
#=============================================================
odds_ratios <- exp(coef(modelo_logit))
odds_ratios## (Intercept) age
## 2.5950355 1.0020931
## jobblue-collar jobentrepreneur
## 0.7422207 0.8708481
## jobhousemaid jobmanagement
## 0.9285449 0.9123801
## jobretired jobself-employed
## 1.4341738 0.9305592
## jobservices jobstudent
## 0.8092517 1.3929680
## jobtechnician jobunemployed
## 0.9420000 1.0415474
## jobunknown maritalmarried
## 0.8583758 1.0240678
## maritalsingle maritalunknown
## 1.1429069 1.2671044
## educationbasic.6y educationbasic.9y
## 1.0518252 0.9352085
## educationhigh.school educationilliterate
## 1.0109514 2.2746658
## educationprofessional.course educationuniversity.degree
## 1.0794753 1.1538843
## educationunknown campaign
## 1.2016570 0.9474858
## previous euribor3m
## 1.2779420 0.5573625
## emp.var.rate cons.conf.idx
## 1.1754488 1.0471349
## contacttelephone poutcomenonexistent
## 0.6503220 2.0985125
## poutcomesuccess
## 6.7463727
#=============================================================
# INTERVALOS DE CONFIANZA
#=============================================================
exp(confint(modelo_logit))## 2.5 % 97.5 %
## (Intercept) 1.4362099 4.6851169
## age 0.9980363 1.0061574
## jobblue-collar 0.6504480 0.8465873
## jobentrepreneur 0.7048930 1.0675931
## jobhousemaid 0.7239646 1.1804165
## jobmanagement 0.7894163 1.0520499
## jobretired 1.1984085 1.7142717
## jobself-employed 0.7639275 1.1265938
## jobservices 0.6999212 0.9336978
## jobstudent 1.1495888 1.6844221
## jobtechnician 0.8367115 1.0597466
## jobunemployed 0.8396952 1.2840776
## jobunknown 0.5674800 1.2649669
## maritalmarried 0.9138111 1.1497623
## maritalsingle 1.0040309 1.3027906
## maritalunknown 0.6013633 2.4576732
## educationbasic.6y 0.8602793 1.2825060
## educationbasic.9y 0.7988776 1.0955659
## educationhigh.school 0.8680799 1.1787175
## educationilliterate 0.5317163 7.6155211
## educationprofessional.course 0.9122043 1.2780134
## educationuniversity.degree 0.9912318 1.3450605
## educationunknown 0.9819735 1.4678087
## campaign 0.9297827 0.9648405
## previous 1.1536887 1.4170674
## euribor3m 0.5168264 0.6010713
## emp.var.rate 1.0846561 1.2738836
## cons.conf.idx 1.0398305 1.0544899
## contacttelephone 0.5927530 0.7129650
## poutcomenonexistent 1.7815799 2.4777762
## poutcomesuccess 5.8160270 7.8346308
#=============================================================
# PREDICCIONES
#=============================================================
probabilidades <- predict(modelo_logit, type = "response")
head(probabilidades)## 1 2 3 4 5 6
## 0.04476810 0.03973472 0.03816944 0.04883455 0.03965501 0.03598679
#=============================================================
# CLASIFICACIÓN
#=============================================================
predicciones <- ifelse(probabilidades > 0.5, "Sí", "No")
predicciones <- factor(predicciones, levels = c("No", "Sí"))
#=============================================================
# VARIABLES SIGNIFICATIVAS
#=============================================================
summary(modelo_logit)$coefficients## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.953600183 0.301614677 3.1616505 1.568777e-03
## age 0.002090947 0.002067310 1.0114336 3.118089e-01
## jobblue-collar -0.298108662 0.067228435 -4.4342645 9.238716e-06
## jobentrepreneur -0.138287704 0.105823177 -1.3067809 1.912871e-01
## jobhousemaid -0.074136504 0.124635396 -0.5948270 5.519591e-01
## jobmanagement -0.091698638 0.073249470 -1.2518676 2.106181e-01
## jobretired 0.360588901 0.091318144 3.9487103 7.857336e-05
## jobself-employed -0.071969622 0.099046978 -0.7266211 4.674581e-01
## jobservices -0.211645344 0.073499109 -2.8795634 3.982262e-03
## jobstudent 0.331436728 0.097435035 3.4016176 6.698830e-04
## jobtechnician -0.059749968 0.060277302 -0.9912515 3.215628e-01
## jobunemployed 0.040707484 0.108304765 0.3758605 7.070206e-01
## jobunknown -0.152713254 0.204111901 -0.7481840 4.543492e-01
## maritalmarried 0.023782704 0.058584286 0.4059571 6.847742e-01
## maritalsingle 0.133574896 0.066441258 2.0104209 4.438666e-02
## maritalunknown 0.236734286 0.356850059 0.6633999 5.070745e-01
## educationbasic.6y 0.050526899 0.101821084 0.4962322 6.197306e-01
## educationbasic.9y -0.066985765 0.080548818 -0.8316170 4.056252e-01
## educationhigh.school 0.010891864 0.078025661 0.1395934 8.889813e-01
## educationilliterate 0.821833123 0.667322622 1.2315379 2.181217e-01
## educationprofessional.course 0.076475110 0.086009074 0.8891517 3.739216e-01
## educationuniversity.degree 0.143133903 0.077860672 1.8383338 6.601324e-02
## educationunknown 0.183701429 0.102515422 1.7919395 7.314267e-02
## campaign -0.053943306 0.009444151 -5.7118214 1.117734e-08
## previous 0.245250954 0.052434748 4.6772601 2.907334e-06
## euribor3m -0.584539363 0.038520029 -15.1749461 5.182029e-52
## emp.var.rate 0.161650025 0.041020134 3.9407484 8.122780e-05
## cons.conf.idx 0.046057767 0.003571149 12.8971852 4.668399e-38
## contacttelephone -0.430287694 0.047100836 -9.1354578 6.513063e-20
## poutcomenonexistent 0.741228767 0.084132707 8.8102332 1.248858e-18
## poutcomesuccess 1.909004983 0.075995667 25.1199188 3.013605e-139
#=============================================================
# IMPORTANCIA ESTADÍSTICA
#=============================================================
# Variables con p-value < 0.05
coeficientes <- summary(modelo_logit)$coefficients
variables_sig <- coeficientes[coeficientes[,4] < 0.05, ]
variables_sig## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.95360018 0.301614677 3.161650 1.568777e-03
## jobblue-collar -0.29810866 0.067228435 -4.434264 9.238716e-06
## jobretired 0.36058890 0.091318144 3.948710 7.857336e-05
## jobservices -0.21164534 0.073499109 -2.879563 3.982262e-03
## jobstudent 0.33143673 0.097435035 3.401618 6.698830e-04
## maritalsingle 0.13357490 0.066441258 2.010421 4.438666e-02
## campaign -0.05394331 0.009444151 -5.711821 1.117734e-08
## previous 0.24525095 0.052434748 4.677260 2.907334e-06
## euribor3m -0.58453936 0.038520029 -15.174946 5.182029e-52
## emp.var.rate 0.16165003 0.041020134 3.940748 8.122780e-05
## cons.conf.idx 0.04605777 0.003571149 12.897185 4.668399e-38
## contacttelephone -0.43028769 0.047100836 -9.135458 6.513063e-20
## poutcomenonexistent 0.74122877 0.084132707 8.810233 1.248858e-18
## poutcomesuccess 1.90900498 0.075995667 25.119919 3.013605e-139
Random Forest es un método basado en múltiples árboles de decisión que permite reducir el sobreajuste y mejorar la estabilidad del modelo predictivo. Además, tiene la capacidad de manejar relaciones no lineales entre las variables, lo que lo hace adecuado para problemas complejos de clasificación. En este estudio, este modelo obtuvo el mejor desempeño en términos de Recall y F1-score, demostrando una alta capacidad para identificar correctamente los clientes que suscriben depósitos a plazo.
#=============================================================
# 1. PREPARACIÓN DE LOS DATOS
#=============================================================
# Verificar estructura de la variable respuesta
table(data1$y)##
## No Sí
## 36548 4640
# Asegurar que la variable respuesta sea factor
data1$y <- factor(data1$y, levels = c("No", "Sí"))
# Verificar proporción de clases
prop.table(table(data1$y))##
## No Sí
## 0.8873458 0.1126542
#=============================================================
# 2. DIVISIÓN ENTRE ENTRENAMIENTO Y PRUEBA
#=============================================================
set.seed(123)
indice_train <- createDataPartition(data1$y, p = 0.70, list = FALSE)
train <- data1[indice_train, ]
test <- data1[-indice_train, ]
# Verificar distribución de clases en train y test
prop.table(table(train$y))##
## No Sí
## 0.8873474 0.1126526
##
## No Sí
## 0.8873422 0.1126578
#=============================================================
# 3. AJUSTE DEL MODELO RANDOM FOREST
#=============================================================
set.seed(123)
modelo_rf <- randomForest(y ~ age + job + marital + education +
campaign + previous + euribor3m +
emp.var.rate + cons.conf.idx +
contact + poutcome,
data = train,
ntree = 500,
importance = TRUE)
# Ver resumen del modelo
modelo_rf##
## Call:
## randomForest(formula = y ~ age + job + marital + education + campaign + previous + euribor3m + emp.var.rate + cons.conf.idx + contact + poutcome, data = train, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 10.37%
## Confusion matrix:
## No Sí class.error
## No 24938 646 0.02525016
## Sí 2345 903 0.72198276
#=============================================================
# 4. PREDICCIONES SOBRE DATOS DE PRUEBA
#=============================================================
# Predicción de clases
pred_rf <- predict(modelo_rf,
newdata = test,
type = "class")
# Predicción de probabilidades
prob_rf <- predict(modelo_rf,
newdata = test,
type = "prob")
head(prob_rf)## No Sí
## 6 0.998 0.002
## 11 1.000 0.000
## 12 0.992 0.008
## 13 0.986 0.014
## 14 1.000 0.000
## 15 1.000 0.000
#=============================================================
# 5. MATRIZ DE CONFUSIÓN
#=============================================================
confusionMatrix(pred_rf, test$y, positive = "Sí")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 10700 1014
## Sí 264 378
##
## Accuracy : 0.8966
## 95% CI : (0.8911, 0.9019)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 0.0005464
##
## Kappa : 0.3236
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.27155
## Specificity : 0.97592
## Pos Pred Value : 0.58879
## Neg Pred Value : 0.91344
## Prevalence : 0.11266
## Detection Rate : 0.03059
## Detection Prevalence : 0.05196
## Balanced Accuracy : 0.62374
##
## 'Positive' Class : Sí
##
#=============================================================
# 6. CURVA ROC Y AUC
#=============================================================
roc_rf <- roc(test$y, prob_rf[, "Sí"])
plot(roc_rf,
main = "Curva ROC - Random Forest")## Area under the curve: 0.7822
#=============================================================
# 7. IMPORTANCIA DE VARIABLES
#=============================================================
importance(modelo_rf)## No Sí MeanDecreaseAccuracy MeanDecreaseGini
## age 37.23384 2.01138176 38.09027 722.54987
## job 56.63689 -3.66747858 53.96796 430.92688
## marital 21.17053 0.05058843 19.82834 160.39266
## education 40.75661 -6.25807827 37.23688 320.64093
## campaign 10.05754 15.03917233 16.22441 334.36962
## previous 10.00531 6.55888450 11.12040 126.01022
## euribor3m 38.65772 6.11171500 42.57890 1026.02555
## emp.var.rate 39.16514 7.40641249 41.16704 247.77328
## cons.conf.idx 37.09537 -14.71890449 38.20570 324.99007
## contact 19.47897 31.99469292 26.59062 74.43644
## poutcome 17.19731 22.72906859 34.29004 323.55840
#=============================================================
# 8. TABLA ORDENADA DE IMPORTANCIA DE VARIABLES
#=============================================================
importancia_rf <- importance(modelo_rf)
importancia_rf_df <- data.frame(
Variable = rownames(importancia_rf),
MeanDecreaseGini = importancia_rf[, "MeanDecreaseGini"]
)
importancia_rf_df <- importancia_rf_df %>%
arrange(desc(MeanDecreaseGini))
importancia_rf_df## Variable MeanDecreaseGini
## euribor3m euribor3m 1026.02555
## age age 722.54987
## job job 430.92688
## campaign campaign 334.36962
## cons.conf.idx cons.conf.idx 324.99007
## poutcome poutcome 323.55840
## education education 320.64093
## emp.var.rate emp.var.rate 247.77328
## marital marital 160.39266
## previous previous 126.01022
## contact contact 74.43644
XGBoost es una técnica avanzada de boosting que construye árboles de decisión de manera secuencial, donde cada nuevo árbol busca corregir los errores de predicción del anterior. Este enfoque permite optimizar el desempeño predictivo y mejorar la precisión del modelo en problemas complejos de clasificación. En este estudio, XGBoost obtuvo el mayor valor de AUC, evidenciando una excelente capacidad para diferenciar entre clientes que suscriben y no suscriben depósitos a plazo.
#=============================================================
# 1. PREPARACIÓN DE LA VARIABLE RESPUESTA
#=============================================================
# Asegurar que y sea factor con niveles claros
data1$y <- factor(data1$y, levels = c("No", "Sí"))
# Crear variable numérica para XGBoost:
# No = 0, Sí = 1
data1$y_num <- ifelse(data1$y == "Sí", 1, 0)
# Esto significa: 0 = No suscribió el depósito.
# 1 = Sí suscribió el depósito.
# Verificar proporción de clases
prop.table(table(data1$y))##
## No Sí
## 0.8873458 0.1126542
#=============================================================
# 2. SELECCIÓN DE VARIABLES
#=============================================================
variables_modelo <- data1 %>%
select(age, job, marital, education,
campaign, previous, euribor3m,
emp.var.rate, cons.conf.idx,
contact, poutcome, y, y_num)
#=============================================================
# 3. DIVISIÓN TRAIN / TEST
#=============================================================
set.seed(123)
indice_train <- createDataPartition(variables_modelo$y,
p = 0.70,
list = FALSE)
train <- variables_modelo[indice_train, ]
test <- variables_modelo[-indice_train, ]
#=============================================================
# 4. MATRICES PARA XGBOOST
#=============================================================
# XGBoost requiere variables numéricas.
# model.matrix convierte variables categóricas en dummies.
x_train <- model.matrix(y_num ~ age + job + marital + education +
campaign + previous + euribor3m +
emp.var.rate + cons.conf.idx +
contact + poutcome,
data = train)[, -1]
x_test <- model.matrix(y_num ~ age + job + marital + education +
campaign + previous + euribor3m +
emp.var.rate + cons.conf.idx +
contact + poutcome,
data = test)[, -1]
y_train <- train$y_num
y_test <- test$y_num
# El comando: model.matrix()
# convierte variables como job, marital, education, contact y poutcome en variables dummy.
# Esto es necesario porque XGBoost no trabaja directamente con variables categóricas en formato texto.
# Crear objetos DMatrix, formato eficiente para XGBoost
dtrain <- xgb.DMatrix(data = x_train, label = y_train)
dtest <- xgb.DMatrix(data = x_test, label = y_test)
#=============================================================
# 5. AJUSTE DEL MODELO XGBOOST
#=============================================================
set.seed(123)
parametros <- list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = 4,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8
)
modelo_xgb <- xgb.train(
params = parametros,
data = dtrain,
nrounds = 100,
verbose = 0
)
# Ver modelo
modelo_xgb## ##### xgb.Booster
## call:
## xgb.train(params = parametros, data = dtrain, nrounds = 100,
## verbose = 0)
## # of features: 30
## # of rounds: 100
# El modelo usa boosting, es decir, construye árboles de decisión secuenciales. Cada nuevo árbol
# intenta corregir errores cometidos por los árboles anteriores.
# Parámetros principales:
# objective = "binary:logistic": indica que el problema es de clasificación binaria.
# nrounds = 100: número de árboles.
# max_depth = 4: profundidad máxima de cada árbol.
# eta = 0.1: tasa de aprendizaje.
# eval_metric = "auc": se evalúa el modelo usando AUC.
#=============================================================
# 6. PREDICCIÓN DE PROBABILIDADES
#=============================================================
prob_xgb <- predict(modelo_xgb, dtest)
head(prob_xgb)## [1] 0.02674267 0.03119629 0.04039941 0.03307610 0.02893238 0.03278977
#=============================================================
# 7. CLASIFICACIÓN
#=============================================================
# Punto de corte tradicional: 0.50
pred_xgb <- ifelse(prob_xgb > 0.5, "Sí", "No")
pred_xgb <- factor(pred_xgb, levels = c("No", "Sí"))
real_xgb <- factor(ifelse(y_test == 1, "Sí", "No"),
levels = c("No", "Sí"))
#=============================================================
# 8. MATRIZ DE CONFUSIÓN
#=============================================================
confusionMatrix(pred_xgb, real_xgb, positive = "Sí")## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 10807 1089
## Sí 157 303
##
## Accuracy : 0.8992
## 95% CI : (0.8937, 0.9044)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1.321e-05
##
## Kappa : 0.2873
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.21767
## Specificity : 0.98568
## Pos Pred Value : 0.65870
## Neg Pred Value : 0.90846
## Prevalence : 0.11266
## Detection Rate : 0.02452
## Detection Prevalence : 0.03723
## Balanced Accuracy : 0.60168
##
## 'Positive' Class : Sí
##
# El comando: confusionMatrix(pred_xgb, real_xgb, positive = "Sí")
# permite evaluar:
# Accuracy
# Sensitivity / Recall
# Specificity
# Precision
# F1#=============================================================
# 9. CURVA ROC Y AUC
#=============================================================
roc_xgb <- roc(real_xgb, prob_xgb)
plot(roc_xgb, main = "Curva ROC - XGBoost")## Area under the curve: 0.8082
# El comando: auc(roc_xgb)
# mide qué tan bien el modelo distingue entre clientes que suscriben y clientes que no suscriben.
# Interpretación general:
# AUC = 0.50: modelo sin capacidad discriminante.
# AUC entre 0.70 y 0.80: aceptable.
# AUC entre 0.80 y 0.90: bueno.
# AUC > 0.90: excelente.#=============================================================
# 10. IMPORTANCIA DE VARIABLES
#=============================================================
importancia_xgb <- xgb.importance(feature_names = colnames(x_train),
model = modelo_xgb)
importancia_xgb## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: euribor3m 0.4757682885 0.227718489 0.209388972
## 2: emp.var.rate 0.1772910850 0.083990800 0.078986587
## 3: cons.conf.idx 0.1186853302 0.125791016 0.106557377
## 4: poutcomesuccess 0.0750686097 0.061219492 0.027570790
## 5: age 0.0541578601 0.152254496 0.189269747
## 6: campaign 0.0272677278 0.105538451 0.108047690
## 7: contacttelephone 0.0183928768 0.037650601 0.037257824
## 8: previous 0.0102140463 0.036068637 0.040983607
## 9: jobblue-collar 0.0066223191 0.017871821 0.016393443
## 10: educationuniversity.degree 0.0046229651 0.012192239 0.019374069
## 11: educationhigh.school 0.0032954185 0.006755990 0.018628912
## 12: maritalsingle 0.0027883033 0.007388440 0.014157973
## 13: educationprofessional.course 0.0024672783 0.009339094 0.015648286
## 14: jobservices 0.0024175719 0.008505659 0.010432191
## 15: educationbasic.9y 0.0023837337 0.010927793 0.014157973
## 16: jobstudent 0.0019114000 0.015325485 0.007451565
## 17: maritalmarried 0.0018449063 0.001100182 0.011922504
## 18: jobretired 0.0018195885 0.005205042 0.005961252
## 19: jobunknown 0.0016129151 0.008111912 0.009687034
## 20: jobself-employed 0.0015935766 0.004434491 0.006706408
## 21: educationbasic.6y 0.0014952967 0.007911512 0.009687034
## 22: jobhousemaid 0.0014498414 0.014098272 0.007451565
## 23: educationunknown 0.0013960610 0.005772408 0.005961252
## 24: jobtechnician 0.0011307235 0.003428561 0.004470939
## 25: jobentrepreneur 0.0010707903 0.007009652 0.007451565
## 26: poutcomenonexistent 0.0010348289 0.005523203 0.003725782
## 27: jobunemployed 0.0008331934 0.005352573 0.005216095
## 28: maritalunknown 0.0007495475 0.011709473 0.004470939
## 29: jobmanagement 0.0006139165 0.001804215 0.002980626
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
# El comando: xgb.importance()
# muestra qué variables fueron más útiles para construir el modelo.
# En XGBoost, la métrica Gain indica cuánto aporta una variable a mejorar las divisiones de los árboles.
# Mientras mayor sea el Gain, más importante es la variable para el modelo.