Preprocessing

  • (recommendation) Order of preprocessing steps (a sketch follows this list):
      1. Data imputation
      2. Handling of new levels in categorical variables
      3. Individual transformations (logarithm, square root, Box-Cox, etc.)
      4. Discretization of quantitative variables
      5. Creation of dummy or indicator variables
      6. Creation of interactions
      7. Centering, scaling, and normalization of variables
      8. Multivariate transformations (PCA, UMAP, t-SNE)
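
A minimal sketch of a recipe following this order, assuming a hypothetical data frame df with outcome y and numeric predictors income and age (the step choices are illustrative, not a fixed prescription):

rec_ordered <- recipe(y ~ ., data = df) %>% 
  step_impute_median(all_numeric_predictors()) %>%          # 1. imputation
  step_novel(all_nominal_predictors()) %>%                  # 2. new categorical levels
  step_log(income, offset = 1) %>%                          # 3. individual transformation
  step_discretize(age, num_breaks = 4) %>%                  # 4. discretization
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%  # 5. dummy variables
  step_interact(~ income:starts_with("age")) %>%            # 6. interactions
  step_normalize(all_numeric_predictors()) %>%              # 7. centering and scaling
  step_pca(all_numeric_predictors(), num_comp = 2)          # 8. multivariate transformation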

Libraries

library(tidyverse)
library(tidymodels)
library(visdat)
library(DataExplorer)
library(splines)
library(glmnet)
library(vip)
library(FactoMineR)
library(themis)

Regression example

Initial data

df_turistas <- read_csv("Train.csv") %>% 
  select(-country)
df_turistas %>% head()

Exploratory analysis

Missing values

df_turistas %>% 
  select(-ID) %>% 
  vis_miss()

Numeric distributions

df_turistas %>% 
  plot_histogram(ncol = 3)

Categorical distributions

df_turistas %>% 
  select(-ID) %>% 
  plot_bar()

Scatterplots

  • Original scale:
df_turistas %>% 
  select(where(is.numeric)) %>% 
  plot_scatterplot(by = "total_cost", ncol = 2)

  • Log scale:
df_turistas %>% 
  select(where(is.numeric)) %>% 
  plot_scatterplot(by = "total_cost", scale_x = "log10", ncol = 2)

  • Adding linear and nonlinear fits:
df_turistas %>% 
  select(where(is.numeric)) %>% 
  pivot_longer(cols = -total_cost) %>% 
  ggplot(aes(x = value, y = total_cost)) +
  facet_wrap(~name, scales = "free") +
  # geom_point() +  # uncomment to also draw the raw points
  scale_x_log10() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x + I(x^2), color = "red") +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ ns(x, df = 2), color = "forestgreen") +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ ns(x, knots = 1), color = "yellow")

Train - Test

set.seed(2022)
particion_inicial <- initial_split(data = df_turistas, prop = 0.8, strata = total_cost)
train <- training(particion_inicial)
test <- testing(particion_inicial)

Cross-validation (k-fold)

set.seed(2022)
submuestras_reg <- vfold_cv(data = train, v = 5, repeats = 1, strata = total_cost)

Models

# Multiple linear regression model
mod_multiple <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

# Regularized linear model: Lasso (mixture = 1)
mod_lasso <- linear_reg(mixture = 1, penalty = 0.5) %>%
  set_mode("regression") %>%
  set_engine("glmnet")


# Regularized linear model: Ridge (mixture = 0)
mod_ridge <- linear_reg(mixture = 0, penalty = 0.5) %>%
  set_mode("regression") %>%
  set_engine("glmnet")

# Regularized linear model: ElasticNet (mixture = 0.5)
mod_enet <- linear_reg(mixture = 0.5, penalty = 0.5) %>%
  set_mode("regression") %>%
  set_engine("glmnet")

Preprocessing recipes

# Basic recipe
rec_basica <- recipe(total_cost ~ ., data = train) %>% 
  update_role(ID, new_role = "ID") %>% 
  step_impute_median(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Recipe with polynomials
rec_poli <- recipe(total_cost ~ ., data = train) %>% 
  update_role(ID, new_role = "ID") %>% 
  step_impute_median(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  step_poly(night_mainland, degree = 2) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Recipe with natural splines
rec_spline <- recipe(total_cost ~ ., data = train) %>% 
  update_role(ID, new_role = "ID") %>% 
  step_impute_median(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  step_ns(night_mainland, deg_free = 2) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Recipe for regularized regression
rec_glmnet <- recipe(total_cost ~ ., data = train) %>% 
  update_role(ID, new_role = "ID") %>% 
  step_impute_median(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  step_poly(night_zanzibar, degree = 2) %>% 
  step_ns(night_mainland, deg_free = 2) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
  step_normalize(all_numeric_predictors())
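
To verify what a recipe actually produces, it can be prepped and baked on the training data (bake(new_data = NULL) returns the processed training set):

rec_glmnet %>% 
  prep() %>% 
  bake(new_data = NULL) %>% 
  glimpse()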

Model fitting

Multiple regression model

  • Basic recipe:
mult_basica <- fit_resamples(mod_multiple, rec_basica, submuestras_reg) 
mult_basica %>% collect_metrics()
  • Polynomial recipe:
mult_poli <- fit_resamples(mod_multiple, rec_poli, submuestras_reg) 
mult_poli %>% collect_metrics()
  • Spline recipe:
mult_spline <- fit_resamples(mod_multiple, rec_spline, submuestras_reg) 
mult_spline %>% collect_metrics()

Lasso model

lasso_fit <- fit_resamples(mod_lasso, rec_glmnet, submuestras_reg) 
lasso_fit %>% collect_metrics()

Ridge model

ridge_fit <- fit_resamples(mod_ridge, rec_glmnet, submuestras_reg) 
ridge_fit %>% collect_metrics()

ElasticNet model

enet_fit <- fit_resamples(mod_enet, rec_glmnet, submuestras_reg) 
enet_fit %>% collect_metrics()

Comparing models

Table

df_rlm <-
  mult_basica %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "RLM")

df_poli <-
  mult_poli %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "RLM - Polinomial")

df_spline <-
  mult_spline %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "RLM - Splines")

df_lasso <-
  lasso_fit %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "Regresión Lasso")

df_ridge <-
  ridge_fit %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "Regresión Ridge")

df_enet <-
  enet_fit %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "Regresión ElasticNet")

df_resultados <- bind_rows(df_rlm, df_poli, df_spline, df_lasso, df_ridge, df_enet)
df_resultados
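
The six blocks above are repetitive; an equivalent, more compact construction (a sketch using purrr over the same resampling objects; df_resultados_alt is an illustrative name) would be:

df_resultados_alt <- list(
  "MLR" = mult_basica, "MLR - Polynomial" = mult_poli, "MLR - Splines" = mult_spline,
  "Lasso regression" = lasso_fit, "Ridge regression" = ridge_fit,
  "ElasticNet regression" = enet_fit
) %>% 
  imap_dfr(~ collect_metrics(.x) %>% select(.metric, mean) %>% mutate(modelo = .y))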

Plot

df_resultados %>% 
  ggplot(aes(x = modelo, y = mean)) +
  facet_wrap(~.metric, scales = "free") +
  geom_point() +
  coord_flip()

Final model: Ridge

Fit

# Preprocessed training data
datos_preprocesados_train <- rec_glmnet %>% 
  prep() %>% 
  bake(new_data = train) %>% 
  select(-ID)

final_ridge <- mod_ridge %>% 
  fit(total_cost ~ ., data = datos_preprocesados_train)

Predictions on test

# Preprocessed test data
datos_preprocesados_test <- rec_glmnet %>% 
  prep() %>% 
  bake(new_data = test)

predichos_test <- predict(final_ridge, new_data = datos_preprocesados_test)
predichos_test
  • RMSE on test:
y_reales <- test %>% pull(total_cost)

rmse_vec(truth = y_reales, estimate = predichos_test$.pred)
## [1] 9809391
  • R^2 on test:
rsq_trad_vec(truth = y_reales, estimate = predichos_test$.pred)
## [1] 0.342528
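
Both test metrics can also be computed in one call with a yardstick metric set (reg_metrics is an illustrative name):

reg_metrics <- metric_set(rmse, rsq_trad)
tibble(truth = y_reales, estimate = predichos_test$.pred) %>% 
  reg_metrics(truth = truth, estimate = estimate)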

Variable importance

final_ridge %>%
  vi() %>%
  slice(1:20) %>% 
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)

Classification example

Initial data

df_cardio <-
  read_csv("heart_failure_clinical_records_dataset.csv") %>%
  mutate(across(
    c(anaemia, diabetes, high_blood_pressure, sex, smoking, DEATH_EVENT),
    as.factor
  ))

df_cardio

Exploratory analysis

Missing values

df_cardio %>% 
  vis_miss()

Class balance?

df_cardio %>% 
  count(DEATH_EVENT) %>% 
  ggplot(aes(x = DEATH_EVENT, y = n)) +
  geom_col() +
  geom_label(aes(label = n))

Numeric distributions

  • Original scale:
df_cardio %>% 
  select(DEATH_EVENT, where(is.numeric)) %>% 
  pivot_longer(cols = -DEATH_EVENT) %>% 
  ggplot(aes(x = value, fill = DEATH_EVENT)) +
  facet_wrap(~name, scales = "free") +
  geom_density(alpha = 0.5)

  • Log transformation:
df_cardio %>% 
  select(DEATH_EVENT, where(is.numeric)) %>% 
  pivot_longer(cols = -DEATH_EVENT) %>% 
  ggplot(aes(x = value, fill = DEATH_EVENT)) +
  facet_wrap(~name, scales = "free") +
  geom_density(alpha = 0.5) +
  scale_x_log10()

Categorical distributions

  • Option 1: absolute frequencies
df_cardio %>% 
  select(where(is.factor)) %>% 
  pivot_longer(cols = -DEATH_EVENT) %>% 
  count(name, value, DEATH_EVENT) %>% 
  ggplot(aes(x = value, y = n, fill = DEATH_EVENT)) +
  facet_wrap(~name, scales = "free") +
  geom_col(position = "dodge")

  • Option 2: relative frequencies
df_cardio %>% 
  select(where(is.factor)) %>% 
  pivot_longer(cols = -DEATH_EVENT) %>% 
  count(name, value, DEATH_EVENT) %>% 
  ggplot(aes(x = value, y = n, fill = DEATH_EVENT)) +
  facet_wrap(~name, scales = "free") +
  geom_col(position = "fill")

Scatterplots

  • age vs serum_creatinine: original scale
df_cardio %>% 
  ggplot(aes(x = age, y = serum_creatinine, color = DEATH_EVENT)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) 

  • age vs serum_creatinine: log scale on Y
df_cardio %>% 
  ggplot(aes(x = age, y = serum_creatinine, color = DEATH_EVENT)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_log10()

PCA (PC1 vs PC2)

datos_pca <- df_cardio
# FactoMineR::PCA standardizes the variables by default (scale.unit = TRUE)
acp <- PCA(X = datos_pca %>% select(where(is.numeric)), graph = FALSE)

datos_pca$cp1 <- acp$ind$coord[, 1]
datos_pca$cp2 <- acp$ind$coord[, 2]

datos_pca %>% 
  ggplot(aes(x = cp1, y = cp2, color = DEATH_EVENT)) +
  geom_hline(yintercept = 0, lty = 2, color = "black") +
  geom_vline(xintercept = 0, lty = 2, color = "black") +
  geom_point()
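
The share of variance captured by the first two components can be read off the eigenvalue table that FactoMineR returns:

acp$eig %>% head()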

Train - Test

set.seed(2022)
particion_inicial_c <- initial_split(data = df_cardio, prop = 0.8, strata = DEATH_EVENT)
train_c <- training(particion_inicial_c)
test_c <- testing(particion_inicial_c)

Cross-validation (k-fold)

set.seed(2022)
submuestras_kfold_c <- vfold_cv(data = train_c, v = 5, repeats = 1, strata = DEATH_EVENT)

Models

# Logistic regression model
mod_logi <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

# Regularized linear model: Lasso (mixture = 1)
mod_lasso_c <- logistic_reg(mixture = 1, penalty = 0.5) %>%
  set_mode("classification") %>%
  set_engine("glmnet")


# Regularized linear model: Ridge (mixture = 0)
mod_ridge_c <- logistic_reg(mixture = 0, penalty = 0.5) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

# Regularized linear model: ElasticNet (mixture = 0.5)
mod_enet_c <- logistic_reg(mixture = 0.5, penalty = 0.5) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

Preprocessing recipes

# Basic recipe
rec_basica_c <- recipe(DEATH_EVENT ~ ., data = train_c) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Recipe with transformations and class upsampling
rec_trasnformer1_c <- recipe(DEATH_EVENT ~ ., data = train_c) %>% 
  step_YeoJohnson(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
  step_normalize(all_numeric_predictors()) %>% 
  step_upsample(DEATH_EVENT)

# Recipe with transformations and class downsampling
rec_trasnformer2_c <- recipe(DEATH_EVENT ~ ., data = train_c) %>% 
  step_YeoJohnson(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
  step_normalize(all_numeric_predictors()) %>% 
  step_downsample(DEATH_EVENT)
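
A caveat worth keeping in mind: the themis sampling steps default to skip = TRUE, so they are applied when the recipe is prepped on the training data but not when baking new data (e.g. a test set). This can be checked directly:

rec_trasnformer1_c %>% prep() %>% bake(new_data = NULL) %>% count(DEATH_EVENT)    # balanced
rec_trasnformer1_c %>% prep() %>% bake(new_data = train_c) %>% count(DEATH_EVENT) # original counts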

Model fitting

Logistic regression

  • Basic recipe:
logi_basica <- fit_resamples(mod_logi, rec_basica_c, submuestras_kfold_c) 
logi_basica %>% collect_metrics()
  • Upsampling recipe:
logi_asc <- fit_resamples(mod_logi, rec_trasnformer1_c, submuestras_kfold_c) 
logi_asc %>% collect_metrics()
  • Downsampling recipe:
logi_desc <- fit_resamples(mod_logi, rec_trasnformer2_c, submuestras_kfold_c) 
logi_desc %>% collect_metrics()

Logistic regression - Lasso

  • Basic recipe:
lasso_basica <- fit_resamples(mod_lasso_c, rec_basica_c, submuestras_kfold_c) 
lasso_basica %>% collect_metrics()
  • Upsampling recipe:
lasso_asc <- fit_resamples(mod_lasso_c, rec_trasnformer1_c, submuestras_kfold_c) 
lasso_asc %>% collect_metrics()
  • Downsampling recipe:
lasso_desc <- fit_resamples(mod_lasso_c, rec_trasnformer2_c, submuestras_kfold_c) 
lasso_desc %>% collect_metrics()

Logistic regression - Ridge

  • Basic recipe:
ridge_basica <- fit_resamples(mod_ridge_c, rec_basica_c, submuestras_kfold_c) 
ridge_basica %>% collect_metrics()
  • Upsampling recipe:
ridge_asc <- fit_resamples(mod_ridge_c, rec_trasnformer1_c, submuestras_kfold_c) 
ridge_asc %>% collect_metrics()
  • Downsampling recipe:
ridge_desc <- fit_resamples(mod_ridge_c, rec_trasnformer2_c, submuestras_kfold_c) 
ridge_desc %>% collect_metrics()

Logistic regression - ElasticNet

  • Basic recipe:
enet_basica <- fit_resamples(mod_enet_c, rec_basica_c, submuestras_kfold_c) 
enet_basica %>% collect_metrics()
  • Upsampling recipe:
enet_asc <- fit_resamples(mod_enet_c, rec_trasnformer1_c, submuestras_kfold_c) 
enet_asc %>% collect_metrics()
  • Downsampling recipe:
enet_desc <- fit_resamples(mod_enet_c, rec_trasnformer2_c, submuestras_kfold_c) 
enet_desc %>% collect_metrics()

Comparing models

Table

df_glm_basica <-
  logi_basica %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM - BƔsica")

df_glm_asc <-
  logi_asc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM - Ascendente")

df_glm_desc <-
  logi_desc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM - Descendente")

df_lasso_basic <-
  lasso_basica %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM Lasso - BƔsica")

df_lasso_asc <-
  lasso_asc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM Lasso - Ascendente")

df_lasso_desc <-
  lasso_desc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM Lasso - Descendente")

df_ridge_basic <-
  ridge_basica %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM Ridge - BƔsica")

df_ridge_asc <-
  ridge_asc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM Ridge - Ascendente")

df_ridge_desc <-
  ridge_desc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM Ridge - Descendente")

df_enet_basic <-
  enet_basica %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM ElasticNet - BƔsica")

df_enet_asc <-
  enet_asc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM ElasticNet - Ascendente")

df_enet_desc <-
  enet_desc %>%
  collect_metrics() %>%
  select(.metric, mean) %>%
  mutate(modelo = "GLM ElasticNet - Descendente")

df_resultados_c <- bind_rows(
  df_glm_basica, df_glm_asc, df_glm_desc,
  df_lasso_basic, df_lasso_asc, df_lasso_desc,
  df_ridge_basic, df_ridge_asc, df_ridge_desc,
  df_enet_basic, df_enet_asc, df_enet_desc
)
df_resultados_c

Plot

df_resultados_c %>% 
  ggplot(aes(x = modelo, y = mean)) +
  facet_wrap(~.metric, scales = "free") +
  geom_point() +
  coord_flip()

Final model: Ridge with upsampling

Fit

# Preprocessed training data: bake(new_data = NULL) returns the processed
# training set, so the upsampling step (skip = TRUE) is actually applied here
datos_preprocesados_train_c <- rec_trasnformer1_c %>% 
  prep() %>% 
  bake(new_data = NULL) 

final_ridge_c <- mod_ridge_c %>% 
  fit(DEATH_EVENT ~ ., data = datos_preprocesados_train_c)

Prediction on test

  • Class (label) predictions:
# Preprocessed test data (the upsampling step is skipped for new data, as intended)
datos_preprocesados_test_c <- rec_trasnformer1_c %>% 
  prep() %>% 
  bake(new_data = test_c)

predichos_test_c <- predict(final_ridge_c, new_data = datos_preprocesados_test_c)
predichos_test_c
  • Probability predictions:
predichos_test_c_prob <-
  predict(final_ridge_c, new_data = datos_preprocesados_test_c, type = "prob")
predichos_test_c_prob
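
predict(type = "class") labels each observation with the most probable class, which in the binary case amounts to a 0.5 cutoff; a different cutoff can be applied by hand (a sketch, where cutoff and pred_custom are illustrative names):

cutoff <- 0.3
pred_custom <- factor(
  if_else(predichos_test_c_prob$.pred_1 >= cutoff, "1", "0"),
  levels = levels(test_c$DEATH_EVENT)
)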

Test error

  • Confusion matrix:
resultados_test <-
  data.frame(reales = test_c$DEATH_EVENT,
             predichos = predichos_test_c$.pred_class)

resultados_test %>% 
  conf_mat(truth = reales, estimate = predichos)
##           Truth
## Prediction  0  1
##          0 41 18
##          1  0  2
  • Confusion matrix plot:
resultados_test %>% 
  conf_mat(truth = reales, estimate = predichos) %>% 
  autoplot()
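
The same object can also be summarized into accuracy, sensitivity, specificity, and related metrics:

resultados_test %>% 
  conf_mat(truth = reales, estimate = predichos) %>% 
  summary()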

  • ROC curve (for label 0):
df_roc <- data.frame(reales = test_c$DEATH_EVENT,
                     predichos_prob0 = predichos_test_c_prob$.pred_0,
                     predichos_prob1 = predichos_test_c_prob$.pred_1)

df_roc %>% 
  roc_curve(truth = reales, estimate = predichos_prob0, event_level = "first") %>% 
  autoplot()

  • Area under the ROC curve (for label 0):
df_roc %>% 
  roc_auc(truth = reales, estimate = predichos_prob0, event_level = "first") 
  • Area under the ROC curve (for label 1); since .pred_1 = 1 - .pred_0, this matches the value above:
df_roc %>% 
  roc_auc(truth = reales, estimate = predichos_prob1, event_level = "second") 

Variable importance

final_ridge_c %>%
  vi() %>%
  slice(1:20) %>% 
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)
