Loading packages
library(tidyverse)
library(tidymodels)
library(vip)
theme_set(theme_bw())
We will build a model to predict which actual hotel stays included children and/or babies, and which did not.
Loading the dataset
hotels <-
  read_csv('https://tidymodels.org/start/case-study/hotels.csv') %>%
  mutate_if(is.character, as.factor)   # treat all character columns as factors
glimpse(hotels)
Rows: 50,000
Columns: 23
$ hotel <fct> City_Hotel, City_Hotel, Resort_Hotel, R~
$ lead_time <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6~
$ stays_in_weekend_nights <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, ~
$ stays_in_week_nights <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, ~
$ adults <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, ~
$ children <fct> none, none, none, none, none, none, chi~
$ meal <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB,~
$ country <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP,~
$ market_segment <fct> Offline_TA/TO, Direct, Online_TA, Onlin~
$ distribution_channel <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA~
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ reserved_room_type <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, ~
$ assigned_room_type <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, ~
$ booking_changes <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ deposit_type <fct> No_Deposit, No_Deposit, No_Deposit, No_~
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ customer_type <fct> Transient-Party, Transient, Transient, ~
$ average_daily_rate <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.~
$ required_car_parking_spaces <fct> none, none, none, none, none, none, non~
$ total_of_special_requests <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, ~
$ arrival_date <date> 2016-09-01, 2017-08-25, 2016-11-19, 20~
colSums(is.na(hotels))
hotel lead_time
0 0
stays_in_weekend_nights stays_in_week_nights
0 0
adults children
0 0
meal country
0 0
market_segment distribution_channel
0 0
is_repeated_guest previous_cancellations
0 0
previous_bookings_not_canceled reserved_room_type
0 0
assigned_room_type booking_changes
0 0
deposit_type days_in_waiting_list
0 0
customer_type average_daily_rate
0 0
required_car_parking_spaces total_of_special_requests
0 0
arrival_date
0
The outcome variable ‘children’ is a factor with two levels (‘children’ and ‘none’), and the classes are heavily imbalanced:
hotels %>%
  count(children) %>%
  mutate(prop = n / sum(n))
# A tibble: 2 x 3
children n prop
<fct> <int> <dbl>
1 children 4038 0.0808
2 none 45962 0.919
set.seed(2021)

# stratify on the outcome so the rare 'children' class appears in the
# same proportion in both partitions
hotels_split <- initial_split(hotels,
                              prop = 0.75,
                              strata = children)
hotel_train <- training(hotels_split)
hotel_test <- testing(hotels_split)

# 10-fold cross-validation on the training set for tuning
hotel_cv <- vfold_cv(hotel_train, v = 10)
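As a quick sanity check (an addition, not part of the original walkthrough), stratifying on children should leave the roughly 8%/92% class split intact in both partitions:

hotel_train %>%
  count(children) %>%
  mutate(prop = n / sum(n))

hotel_test %>%
  count(children) %>%
  mutate(prop = n / sum(n))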
# lasso-penalized logistic regression; mixture = 1 is a pure L1 penalty,
# and the amount of penalization is left to be tuned
pen_model <-
  logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine('glmnet')
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter",
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

pen_rec <-
  recipe(children ~ ., data = hotel_train) %>%
  step_date(arrival_date) %>%                           # year / month / day-of-week features
  step_holiday(arrival_date, holidays = holidays) %>%   # indicator columns for the holidays above
  step_rm(arrival_date) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%        # dummy-encode the factor predictors
  step_zv(all_predictors()) %>%                         # drop zero-variance columns
  step_normalize(all_predictors())                      # center and scale, as glmnet expects
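To see what this preprocessing actually produces (an optional check, not in the original code), we can prep the recipe on the training data and glimpse the resulting predictors; the date, holiday, and dummy steps expand the raw columns into a wider, fully numeric design matrix:

pen_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%   # the processed training set; juice() in older recipes versions
  glimpse()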
pen_wf <-
  workflow() %>%
  add_model(pen_model) %>%
  add_recipe(pen_rec)

# 30 candidate penalty values, evenly spaced on the log scale
pen_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
pen_res <-
  tune_grid(
    pen_wf,
    resamples = hotel_cv,
    grid = pen_reg_grid,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(roc_auc)
  )
pen_res
# Tuning results
# 10-fold cross-validation
# A tibble: 10 x 5
splits id .metrics .notes .predictions
<list> <chr> <list> <list> <list>
1 <split [33750/375~ Fold01 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
2 <split [33750/375~ Fold02 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
3 <split [33750/375~ Fold03 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
4 <split [33750/375~ Fold04 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
5 <split [33750/375~ Fold05 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
6 <split [33750/375~ Fold06 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
7 <split [33750/375~ Fold07 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
8 <split [33750/375~ Fold08 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
9 <split [33750/375~ Fold09 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
10 <split [33750/375~ Fold10 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
pen_metrics <-
  pen_res %>%
  collect_metrics()
pen_metrics
# A tibble: 30 x 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0001 roc_auc binary 0.872 10 0.00328 Preprocessor1_Model01
2 0.000127 roc_auc binary 0.872 10 0.00329 Preprocessor1_Model02
3 0.000161 roc_auc binary 0.873 10 0.00330 Preprocessor1_Model03
4 0.000204 roc_auc binary 0.873 10 0.00331 Preprocessor1_Model04
5 0.000259 roc_auc binary 0.873 10 0.00333 Preprocessor1_Model05
6 0.000329 roc_auc binary 0.874 10 0.00334 Preprocessor1_Model06
7 0.000418 roc_auc binary 0.874 10 0.00334 Preprocessor1_Model07
8 0.000530 roc_auc binary 0.874 10 0.00332 Preprocessor1_Model08
9 0.000672 roc_auc binary 0.875 10 0.00330 Preprocessor1_Model09
10 0.000853 roc_auc binary 0.875 10 0.00327 Preprocessor1_Model10
# ... with 20 more rows
pen_metrics %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_log10(labels = scales::label_number())
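Because collect_metrics() also returns std_err, a variant of this plot with a one-standard-error band (a sketch, not in the original) makes it easier to judge how much the AUC really changes along the penalty path:

pen_metrics %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_ribbon(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.2) +
  geom_line() +
  geom_point() +
  scale_x_log10(labels = scales::label_number())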
## Best penalized models
top_models <-
  pen_res %>%
  show_best(metric = 'roc_auc') %>%
  arrange(penalty)
top_models
# A tibble: 5 x 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.000853 roc_auc binary 0.875 10 0.00327 Preprocessor1_Model10
2 0.00108 roc_auc binary 0.875 10 0.00326 Preprocessor1_Model11
3 0.00137 roc_auc binary 0.875 10 0.00326 Preprocessor1_Model12
4 0.00174 roc_auc binary 0.875 10 0.00323 Preprocessor1_Model13
5 0.00221 roc_auc binary 0.875 10 0.00315 Preprocessor1_Model14
pen_best <- pen_res %>%
  select_best(metric = 'roc_auc')
pen_best
# A tibble: 1 x 2
penalty .config
<dbl> <chr>
1 0.00137 Preprocessor1_Model12
pen_auc <-
  pen_res %>%
  collect_predictions(parameters = pen_best) %>%
  roc_curve(children, .pred_children) %>%
  mutate(model = "Logistic Regression")
autoplot(pen_auc)
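If the lasso model were the one we wanted to carry forward, the selected penalty could be locked into the workflow with finalize_workflow(); this is only a sketch, since we move on to a random forest instead:

final_pen_wf <-
  pen_wf %>%
  finalize_workflow(pen_best)   # replaces tune() with the selected penalty of 0.00137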
cores <- parallel::detectCores()
cores
[1] 16
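Here the core count is handed to ranger's own num.threads argument. An alternative (an extra dependency, not used in this write-up) is to register a doParallel backend so that tune_grid() runs resamples in parallel regardless of engine:

library(doParallel)
registerDoParallel(cores = cores)   # tune will parallelize over resamples once a backend is registered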
# random forest with ranger; ranger can build its trees across multiple threads
rf_mod <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>%
  set_engine("ranger", num.threads = cores) %>%
  set_mode("classification")
# tree ensembles don't need dummy variables or normalization,
# so this recipe only handles the date column
rf_rec <-
  recipe(children ~ ., data = hotel_train) %>%
  step_date(arrival_date) %>%
  step_holiday(arrival_date) %>%
  step_rm(arrival_date)

rf_wf <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(rf_rec)
rf_mod
Random Forest Model Specification (classification)
Main Arguments:
mtry = tune()
trees = 500
min_n = tune()
Engine-Specific Arguments:
num.threads = cores
Computational engine: ranger
rf_wf
== Workflow ====================================================================
Preprocessor: Recipe
Model: rand_forest()
-- Preprocessor ----------------------------------------------------------------
3 Recipe Steps
* step_date()
* step_holiday()
* step_rm()
-- Model -----------------------------------------------------------------------
Random Forest Model Specification (classification)
Main Arguments:
mtry = tune()
trees = 500
min_n = tune()
Engine-Specific Arguments:
num.threads = cores
Computational engine: ranger
rf_mod %>% parameters()
Collection of 2 parameters for tuning
identifier type object
mtry mtry nparam[?]
min_n min_n nparam[+]
Model parameters needing finalization:
# Randomly Selected Predictors ('mtry')
See `?dials::finalize` or `?dials::update.parameters` for more information.
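The mtry range is marked as needing finalization because its upper bound depends on the number of predictors, which is only known once the recipe has run. tune_grid() takes care of this internally, but it can also be done by hand; as a sketch, using the raw predictor columns as an approximation of the post-recipe set:

rf_params <-
  rf_mod %>%
  parameters() %>%
  finalize(hotel_train %>% select(-children))
rf_params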
set.seed(2021)

# grid = 20 lets tune_grid() generate 20 candidate (mtry, min_n) combinations
rf_res <-
  tune_grid(rf_wf,
            resamples = hotel_cv,
            grid = 20,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
rf_res %>% show_best(metric = 'roc_auc')
# A tibble: 5 x 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 4 roc_auc binary 0.925 10 0.00152 Preprocessor1_Model19
2 8 6 roc_auc binary 0.923 10 0.00181 Preprocessor1_Model11
3 7 11 roc_auc binary 0.923 10 0.00187 Preprocessor1_Model05
4 5 13 roc_auc binary 0.922 10 0.00155 Preprocessor1_Model18
5 11 9 roc_auc binary 0.919 10 0.00203 Preprocessor1_Model02
rf_best <- rf_res %>% select_best(metric = 'roc_auc')
autoplot(rf_res)
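select_best() picks the numerically best AUC. A more conservative alternative (not used here) is to take the simplest candidate within one standard error of that best result, treating larger min_n as simpler:

rf_res %>%
  select_by_one_std_err(desc(min_n), metric = 'roc_auc')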
rf_auc <-
  rf_res %>%
  collect_predictions(parameters = rf_best) %>%
  roc_curve(children, .pred_children) %>%
  mutate(model = "Random Forest")

bind_rows(rf_auc, pen_auc) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +
  coord_equal() +
  scale_color_viridis_d(option = "plasma", end = .6)
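For a single-number comparison alongside the curves (an added summary, not in the original), the same out-of-fold predictions can be fed to roc_auc():

bind_rows(
  pen_res %>%
    collect_predictions(parameters = pen_best) %>%
    roc_auc(children, .pred_children) %>%
    mutate(model = "Logistic Regression"),
  rf_res %>%
    collect_predictions(parameters = rf_best) %>%
    roc_auc(children, .pred_children) %>%
    mutate(model = "Random Forest")
)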
# final model specification, with hyperparameter values fixed from the tuning results;
# importance = "impurity" asks ranger for variable importance scores
last_rf_mod <-
  rand_forest(mtry = 8, min_n = 7, trees = 500) %>%
  set_engine("ranger", num.threads = cores, importance = "impurity") %>%
  set_mode("classification")

# the last workflow
last_rf_workflow <-
  rf_wf %>%
  update_model(last_rf_mod)

# the last fit: train on the full training set, evaluate once on the test set
set.seed(2021)
last_rf_fit <-
  last_rf_workflow %>%
  last_fit(hotels_split)
last_rf_fit
# Resampling results
# Manual resampling
# A tibble: 1 x 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [37500~ train/test~ <tibble [2 ~ <tibble [0~ <tibble [12,500~ <workflo~
last_rf_fit %>%
  collect_metrics()
# A tibble: 2 x 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.947 Preprocessor1_Model1
2 roc_auc binary 0.925 Preprocessor1_Model1
# pull the fitted parsnip model out of the workflow and plot variable importance
# (in newer workflows versions, extract_fit_parsnip() replaces pull_workflow_fit())
last_rf_fit %>%
  pluck(".workflow", 1) %>%
  pull_workflow_fit() %>%
  vip(num_features = 20)
last_rf_fit %>%
  collect_predictions() %>%
  roc_curve(children, .pred_children) %>%
  autoplot()
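The fitted workflow stored in last_rf_fit can also be reused for scoring new bookings; as a sketch, predicting class probabilities for a few test rows (any data frame with the same predictor columns would work):

final_wf <- last_rf_fit %>% pluck(".workflow", 1)
predict(final_wf, new_data = hotel_test %>% slice(1:5), type = "prob")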