Datos iniciales
df_cardio <-
read_csv("heart_failure_clinical_records_dataset.csv") %>%
mutate(across(
c(anaemia, diabetes, high_blood_pressure, sex, smoking, DEATH_EVENT),
as.factor
))
df_cardio
Exploratorio
Valores ausentes
df_cardio %>%
vis_miss()

ĀæBalance de clases?
df_cardio %>%
count(DEATH_EVENT) %>%
ggplot(aes(x = DEATH_EVENT, y = n)) +
geom_col() +
geom_label(aes(label = n))

Distribuciones numƩricas
df_cardio %>%
select(DEATH_EVENT, where(is.numeric)) %>%
pivot_longer(cols = -DEATH_EVENT) %>%
ggplot(aes(x = value, fill = DEATH_EVENT)) +
facet_wrap(~name, scales = "free") +
geom_density(alpha = 0.5)

- Transformación logarĆtmica:
df_cardio %>%
select(DEATH_EVENT, where(is.numeric)) %>%
pivot_longer(cols = -DEATH_EVENT) %>%
ggplot(aes(x = value, fill = DEATH_EVENT)) +
facet_wrap(~name, scales = "free") +
geom_density(alpha = 0.5) +
scale_x_log10()

Distribuciones categóricas
- Opción 1: frecuencias absolutas
df_cardio %>%
select(where(is.factor)) %>%
pivot_longer(cols = -DEATH_EVENT) %>%
count(name, value, DEATH_EVENT) %>%
ggplot(aes(x = value, y = n, fill = DEATH_EVENT)) +
facet_wrap(~name, scales = "free") +
geom_col(position = "dodge")

- Opción 1: frecuencias relativas
df_cardio %>%
select(where(is.factor)) %>%
pivot_longer(cols = -DEATH_EVENT) %>%
count(name, value, DEATH_EVENT) %>%
ggplot(aes(x = value, y = n, fill = DEATH_EVENT)) +
facet_wrap(~name, scales = "free") +
geom_col(position = "fill")

Dispersiones
- Age vs serum_creatinine: escala original
df_cardio %>%
ggplot(aes(x = age, y = serum_creatinine , color = DEATH_EVENT)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)

- Age vs serum_creatinine: escala logarĆtmica de Y
df_cardio %>%
ggplot(aes(x = age, y = serum_creatinine , color = DEATH_EVENT)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
scale_y_log10()

PCA (CP1 VS CP2)
datos_pca <- df_cardio
acp <- PCA(X = datos_pca %>% select(where(is.numeric)), graph = FALSE)
datos_pca$cp1 <- acp$ind$coord[, 1]
datos_pca$cp2 <- acp$ind$coord[, 2]
datos_pca %>%
ggplot(aes(x = cp1, y = cp2, color = DEATH_EVENT)) +
geom_hline(yintercept = 0, lty = 2, color = "black") +
geom_vline(xintercept = 0, lty = 2, color = "black") +
geom_point()

Train - Test
set.seed(2022)
particion_inicial_c <- initial_split(data = df_cardio, prop = 0.8, strata = DEATH_EVENT)
train_c <- training(particion_inicial_c)
test_c <- testing(particion_inicial_c)
Validación cruzada (k-fold)
set.seed(2022)
submuestras_kfold_c <- vfold_cv(data = train_c, v = 5, repeats = 1, strata = DEATH_EVENT)
Modelos
# Modelo de regresión logĆstica
mod_logi <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
# Modelo lineal regularizado: Ridge
mod_lasso_c <- logistic_reg(mixture = 1, penalty = 0.5) %>%
set_mode("classification") %>%
set_engine("glmnet")
# Modelo lineal regularizado: Ridge
mod_ridge_c <- logistic_reg(mixture = 0, penalty = 0.5) %>%
set_mode("classification") %>%
set_engine("glmnet")
# Modelo lineal regularizado: ElasticNet
mod_enet_c <- logistic_reg(mixture = 0.5, penalty = 0.5) %>%
set_mode("classification") %>%
set_engine("glmnet")
Recetas de preprocesamiento
# Receta bƔsica
rec_basica_c <- recipe(DEATH_EVENT ~ ., data = train_c) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE)
# Receta con trasnformaciones y balanceo de clases ascendente
rec_trasnformer1_c <- recipe(DEATH_EVENT ~ ., data = train_c) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_upsample(DEATH_EVENT)
# Receta con trasnformaciones y balanceo de clases descendente
rec_trasnformer2_c <- recipe(DEATH_EVENT ~ ., data = train_c) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_downsample(DEATH_EVENT)
Ajuste de modelos
R. LogĆstica
logi_basica <- fit_resamples(mod_logi, rec_basica_c, submuestras_kfold_c)
logi_basica %>% collect_metrics()
- Receta con muestreo ascendente:
logi_asc <- fit_resamples(mod_logi, rec_trasnformer1_c, submuestras_kfold_c)
logi_asc %>% collect_metrics()
- Receta con muestreo descendente:
logi_desc <- fit_resamples(mod_logi, rec_trasnformer2_c, submuestras_kfold_c)
logi_desc %>% collect_metrics()
R. LogĆstica - Lasso
lasso_basica <- fit_resamples(mod_lasso_c, rec_basica_c, submuestras_kfold_c)
lasso_basica %>% collect_metrics()
- Receta con muestreo ascendente:
lasso_asc <- fit_resamples(mod_lasso_c, rec_trasnformer1_c, submuestras_kfold_c)
lasso_asc %>% collect_metrics()
- Receta con muestreo descendente:
lasso_desc <- fit_resamples(mod_lasso_c, rec_trasnformer2_c, submuestras_kfold_c)
lasso_desc %>% collect_metrics()
R. LogĆstica - Ridge
ridge_basica <- fit_resamples(mod_ridge_c, rec_basica_c, submuestras_kfold_c)
ridge_basica %>% collect_metrics()
- Receta con muestreo ascendente:
ridge_asc <- fit_resamples(mod_ridge_c, rec_trasnformer1_c, submuestras_kfold_c)
ridge_asc %>% collect_metrics()
- Receta con muestreo descendente:
ridge_desc <- fit_resamples(mod_ridge_c, rec_trasnformer2_c, submuestras_kfold_c)
ridge_desc %>% collect_metrics()
R. LogĆstica - ElasticNet
enet_basica <- fit_resamples(mod_enet_c, rec_basica_c, submuestras_kfold_c)
enet_basica %>% collect_metrics()
- Receta con muestreo ascendente:
enet_asc <- fit_resamples(mod_enet_c, rec_trasnformer1_c, submuestras_kfold_c)
enet_asc %>% collect_metrics()
- Receta con muestreo descendente:
enet_desc <- fit_resamples(mod_enet_c, rec_trasnformer2_c, submuestras_kfold_c)
enet_desc %>% collect_metrics()
Comparando modelos
Tabla
df_glm_basica <-
logi_basica %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM - BƔsica")
df_glm_asc <-
logi_asc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM - Ascendente")
df_glm_desc <-
logi_desc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM - Descendente")
df_lasso_basic <-
lasso_basica %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM Lasso - BƔsica")
df_lasso_asc <-
lasso_asc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM Lasso - Ascendente")
df_lasso_desc <-
lasso_desc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM Lasso - Descendente")
df_ridge_basic <-
ridge_basica %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM Ridge - BƔsica")
df_ridge_asc <-
ridge_asc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM Ridge - Ascendente")
df_ridge_desc <-
ridge_desc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM Ridge - Descendente")
df_enet_basic <-
enet_basica %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM ElasticNet - BƔsica")
df_enet_asc <-
enet_asc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM ElasticNet - Ascendente")
df_enet_desc <-
enet_desc %>%
collect_metrics() %>%
select(.metric, mean) %>%
mutate(modelo = "GLM ElasticNet - Descendente")
df_resultados_c <- bind_rows(
df_glm_basica, df_glm_asc, df_glm_desc,
df_lasso_basic, df_lasso_asc, df_lasso_desc,
df_ridge_basic, df_ridge_asc, df_ridge_desc,
df_enet_basic, df_enet_asc, df_enet_desc
)
df_resultados_c
GrƔfico
df_resultados_c %>%
ggplot(aes(x = modelo, y = mean)) +
facet_wrap(~.metric, scales = "free") +
geom_point() +
coord_flip()

Modelo final: Ridge
Ascendente
Ajuste
# Datos preprocesados para train
datos_preprocesados_train_c <- rec_trasnformer1_c %>%
prep() %>%
bake(new_data = train_c)
final_ridge_c <- mod_ridge_c %>%
fit(DEATH_EVENT ~ ., data = datos_preprocesados_train_c)
Predicción en test
- Predicción de clases o etiquetas:
# Datos preprocesados para test
datos_preprocesados_test_c <- rec_trasnformer1_c %>%
prep() %>%
bake(new_data = test_c)
predichos_test_c <- predict(final_ridge_c, new_data = datos_preprocesados_test_c)
predichos_test_c
- Predicción de probabilidades:
predichos_test_c_prob <-
predict(final_ridge_c, new_data = datos_preprocesados_test_c, type = "prob")
predichos_test_c_prob
Error en test
resultados_test <-
data.frame(reales = test_c$DEATH_EVENT,
predichos = predichos_test_c$.pred_class)
resultados_test %>%
conf_mat(truth = reales, estimate = predichos)
## Truth
## Prediction 0 1
## 0 41 18
## 1 0 2
- GrÔfico de matriz de confusión:
resultados_test %>%
conf_mat(truth = reales, estimate = predichos) %>%
autoplot()

- Curva ROC (para la etiqueta 0):
df_roc <- data.frame(reales = test_c$DEATH_EVENT,
predichos_prob0 = predichos_test_c_prob$.pred_0)
df_roc %>%
roc_curve(truth = reales, estimate = predichos_prob0, event_level = "first") %>%
autoplot()

- Ćrea bajo la curva ROC (para la etiqueta 0):
df_roc %>%
roc_auc(truth = reales, estimate = predichos_prob0, event_level = "first")
- Ćrea bajo la curva ROC (para la etiqueta 1):
df_roc %>%
roc_auc(truth = reales, estimate = predichos_prob0, event_level = "second")
Importancia de variables
final_ridge_c %>%
vi() %>%
slice(1:20) %>%
mutate(
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)
) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
