Bibliotecas
library(tidyverse)
library(tidymodels)
Ejemplo regresión
Datos iniciales
# Read Train.csv into `df_turistas` and show its first rows.
df_turistas <- read_csv("Train.csv")
head(df_turistas)
Remuestreo
Partición inicial
# Fix the RNG seed so the split below is reproducible.
set.seed(2022)

# 80% of the rows go to training; the split is stratified on `total_cost`.
particion_inicial <- initial_split(
  data = df_turistas,
  prop = 0.8,
  strata = total_cost
)

# Extract both sides of the split.
test <- testing(particion_inicial)
train <- training(particion_inicial)
- ¿La distribución de la variable respuesta quedó similar para el
train y el test?
# Minimum and maximum of the response in the training set.
range(pull(train, total_cost))
## [1] 49000 99532875
# Minimum and maximum of the response in the test set.
range(pull(test, total_cost))
## [1] 50000 95992659
Validación cruzada (k-fold)
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(2022)

# 5-fold cross-validation (one repeat), stratified on `total_cost`.
submuestras_kfold <- vfold_cv(
  data = df_turistas,
  v = 5,
  repeats = 1,
  strata = total_cost
)
- Puedo acceder a la primera partición del total de k-folds:
# Keep only the first fold and extract its `splits` list-column.
pull(slice(submuestras_kfold, 1), splits)
## [[1]]
## <Analysis/Assess/Total>
## <3845/964/4809>
Remuestreo Bootstrap
# Fix the RNG seed so the bootstrap draws are reproducible.
set.seed(2022)

# Five bootstrap resamples, stratified on `total_cost`.
submuestras_boots <- bootstraps(
  data = df_turistas,
  times = 5,
  strata = total_cost
)
Arquitecturas de modelos
Modelo con lm()
# Modelling data: drop the `ID` column, then remove every row containing NA.
df_modelos <- na.omit(select(df_turistas, -ID))

# Linear model of `total_cost` on all remaining columns.
modelo_lm <- lm(formula = total_cost ~ ., data = df_modelos)
Modelo con linear_reg()
# Linear-regression specification using the "lm" engine.
modelo_tidy_lm <- set_engine(
  set_mode(linear_reg(), "regression"),
  "lm"
)

# Linear-regression specification using the "glmnet" engine.
modelo_tidy_glmnet <- set_engine(
  set_mode(linear_reg(), "regression"),
  "glmnet"
)

# Linear-regression specification using the "keras" engine.
modelo_tidy_keras <- set_engine(
  set_mode(linear_reg(), "regression"),
  "keras"
)
- Imprimo las arquitecturas de modelos de regresión lineal:
# Show each specification; the `##` lines are the echoed console output.
print(modelo_tidy_lm)
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
print(modelo_tidy_glmnet)
## Linear Regression Model Specification (regression)
##
## Computational engine: glmnet
print(modelo_tidy_keras)
## Linear Regression Model Specification (regression)
##
## Computational engine: keras
- Ajusto el modelo con la arquitectura “lm” a través de la función fit():
# Fit the "lm" specification: `total_cost` against every other column.
ajuste_tidy_lm <- fit(
  modelo_tidy_lm,
  total_cost ~ .,
  data = df_modelos
)
Ejemplo clasificación
Datos iniciales
# Read the heart-failure records and convert the listed columns
# (including the response `DEATH_EVENT`) to factors.
df_cardio <- mutate(
  read_csv("heart_failure_clinical_records_dataset.csv"),
  across(
    c(anaemia, diabetes, high_blood_pressure, sex, smoking, DEATH_EVENT),
    as.factor
  )
)

# Display the resulting data.
print(df_cardio)
Remuestreo
Partición inicial
# Fix the RNG seed so the split below is reproducible.
set.seed(2022)

# 80% of the rows go to training; the split is stratified on `DEATH_EVENT`.
particion_inicial_c <- initial_split(
  data = df_cardio,
  prop = 0.8,
  strata = DEATH_EVENT
)

# Extract both sides of the split.
test_c <- testing(particion_inicial_c)
train_c <- training(particion_inicial_c)
- Distribución de la variable respuesta en train:
# Class proportions of `DEATH_EVENT` in the training set.
# The vector is still piped into table() on purpose: the pipe placeholder is
# what yields the "." header in the printed table.
pull(train_c, DEATH_EVENT) %>%
  table() %>%
  prop.table()
## .
## 0 1
## 0.6806723 0.3193277
- Distribución de la variable respuesta en test:
# Class proportions of `DEATH_EVENT` in the test set.
# The vector is still piped into table() on purpose: the pipe placeholder is
# what yields the "." header in the printed table.
pull(test_c, DEATH_EVENT) %>%
  table() %>%
  prop.table()
## .
## 0 1
## 0.6721311 0.3278689
Validación cruzada (k-fold)
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(2022)

# 5-fold cross-validation (one repeat), stratified on `DEATH_EVENT`.
submuestras_kfold_c <- vfold_cv(
  data = df_cardio,
  v = 5,
  repeats = 1,
  strata = DEATH_EVENT
)
Remuestreo Bootstrap
# Fix the RNG seed so the bootstrap draws are reproducible.
set.seed(2022)

# Five bootstrap resamples, stratified on `DEATH_EVENT`.
submuestras_boots_c <- bootstraps(
  data = df_cardio,
  times = 5,
  strata = DEATH_EVENT
)
Arquitectura de modelos
Modelo glm()
# Logistic regression of `DEATH_EVENT` on every other column of `df_cardio`.
# `family` stays a character string so the stored call prints unchanged.
modelo_logi <- glm(
  formula = DEATH_EVENT ~ .,
  family = "binomial",
  data = df_cardio
)

# Coefficient table and deviance summary of the fit.
print(summary(modelo_logi))
##
## Call:
## glm(formula = DEATH_EVENT ~ ., family = "binomial", data = df_cardio)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1848 -0.5706 -0.2401 0.4466 2.6668
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.018e+01 5.657e+00 1.801 0.071774 .
## age 4.742e-02 1.580e-02 3.001 0.002690 **
## anaemia1 -7.470e-03 3.605e-01 -0.021 0.983467
## creatinine_phosphokinase 2.222e-04 1.779e-04 1.249 0.211684
## diabetes1 1.451e-01 3.512e-01 0.413 0.679380
## ejection_fraction -7.666e-02 1.633e-02 -4.695 2.67e-06 ***
## high_blood_pressure1 -1.027e-01 3.587e-01 -0.286 0.774688
## platelets -1.200e-06 1.889e-06 -0.635 0.525404
## serum_creatinine 6.661e-01 1.815e-01 3.670 0.000242 ***
## serum_sodium -6.698e-02 3.974e-02 -1.686 0.091855 .
## sex1 -5.337e-01 4.139e-01 -1.289 0.197299
## smoking1 -1.349e-02 4.126e-01 -0.033 0.973915
## time -2.104e-02 3.014e-03 -6.981 2.92e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 375.35 on 298 degrees of freedom
## Residual deviance: 219.55 on 286 degrees of freedom
## AIC: 245.55
##
## Number of Fisher Scoring iterations: 6
Modelo con logistic_reg()
# Logistic-regression specification using the "glm" engine.
modelo_clas_glm <- set_engine(
  set_mode(logistic_reg(), "classification"),
  "glm"
)

# Logistic-regression specification using the "glmnet" engine.
modelo_clas_glmnet <- set_engine(
  set_mode(logistic_reg(), "classification"),
  "glmnet"
)

# Logistic-regression specification using the "keras" engine.
modelo_clas_keras <- set_engine(
  set_mode(logistic_reg(), "classification"),
  "keras"
)