Loading packages

library(tidyverse)
library(tidymodels)
library(vip)
theme_set(theme_bw())

We will build a model to predict which actual hotel stays included children and/or babies, and which did not.

Loading dataset

hotels <- 
  read_csv('https://tidymodels.org/start/case-study/hotels.csv') %>% 
  mutate_if(is.character, as.factor)

Checking data

glimpse(hotels)

Rows: 50,000
Columns: 23
$ hotel                          <fct> City_Hotel, City_Hotel, Resort_Hotel, R~
$ lead_time                      <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6~
$ stays_in_weekend_nights        <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, ~
$ stays_in_week_nights           <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, ~
$ adults                         <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, ~
$ children                       <fct> none, none, none, none, none, none, chi~
$ meal                           <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB,~
$ country                        <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP,~
$ market_segment                 <fct> Offline_TA/TO, Direct, Online_TA, Onlin~
$ distribution_channel           <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA~
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ reserved_room_type             <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, ~
$ assigned_room_type             <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, ~
$ booking_changes                <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ deposit_type                   <fct> No_Deposit, No_Deposit, No_Deposit, No_~
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ customer_type                  <fct> Transient-Party, Transient, Transient, ~
$ average_daily_rate             <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.~
$ required_car_parking_spaces    <fct> none, none, none, none, none, none, non~
$ total_of_special_requests      <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, ~
$ arrival_date                   <date> 2016-09-01, 2017-08-25, 2016-11-19, 20~

colSums(is.na(hotels))

                         hotel                      lead_time 
                             0                              0 
       stays_in_weekend_nights           stays_in_week_nights 
                             0                              0 
                        adults                       children 
                             0                              0 
                          meal                        country 
                             0                              0 
                market_segment           distribution_channel 
                             0                              0 
             is_repeated_guest         previous_cancellations 
                             0                              0 
previous_bookings_not_canceled             reserved_room_type 
                             0                              0 
            assigned_room_type                booking_changes 
                             0                              0 
                  deposit_type           days_in_waiting_list 
                             0                              0 
                 customer_type             average_daily_rate 
                             0                              0 
   required_car_parking_spaces      total_of_special_requests 
                             0                              0 
                  arrival_date 
                             0

Outcome variable ‘children’ is a factor variable with two levels

 hotels %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))

# A tibble: 2 x 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  4038 0.0808
2 none     45962 0.919

Data splitting

set.seed(2021)
hotels_split <- initial_split(hotels,
                              prop= .75,
                              strata = children)

hotel_train <- training(hotels_split)
hotel_test <- testing(hotels_split)

Data resampling (CV)

hotel_cv <- vfold_cv(hotel_train, v = 10)

Penalized model

pen_model <- 
  logistic_reg(penalty = tune(), mixture = 1) %>% 
  set_engine('glmnet')

holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter", 
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

pen_rec <- 
  recipe(children ~ ., data = hotel_train) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date, holidays = holidays) %>% 
  step_rm(arrival_date) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors())

pen_wf <- 
  workflow() %>% 
  add_model(pen_model) %>% 
  add_recipe(pen_rec)

Grid generation

pen_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))

Run the model

pen_res <- 
  tune_grid(
    pen_wf,
    resamples = hotel_cv,
    grid = pen_reg_grid,
    control = control_grid(save_pred = T),
    metrics = metric_set(roc_auc)
  )
pen_res

# Tuning results
# 10-fold cross-validation 
# A tibble: 10 x 5
   splits             id     .metrics        .notes         .predictions        
   <list>             <chr>  <list>          <list>         <list>              
 1 <split [33750/375~ Fold01 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 2 <split [33750/375~ Fold02 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 3 <split [33750/375~ Fold03 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 4 <split [33750/375~ Fold04 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 5 <split [33750/375~ Fold05 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 6 <split [33750/375~ Fold06 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 7 <split [33750/375~ Fold07 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 8 <split [33750/375~ Fold08 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
 9 <split [33750/375~ Fold09 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~
10 <split [33750/375~ Fold10 <tibble [30 x ~ <tibble [0 x ~ <tibble [112,500 x ~

Metrics

pen_metrics <- 
  pen_res %>% 
  collect_metrics()
pen_metrics

# A tibble: 30 x 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 0.0001   roc_auc binary     0.872    10 0.00328 Preprocessor1_Model01
 2 0.000127 roc_auc binary     0.872    10 0.00329 Preprocessor1_Model02
 3 0.000161 roc_auc binary     0.873    10 0.00330 Preprocessor1_Model03
 4 0.000204 roc_auc binary     0.873    10 0.00331 Preprocessor1_Model04
 5 0.000259 roc_auc binary     0.873    10 0.00333 Preprocessor1_Model05
 6 0.000329 roc_auc binary     0.874    10 0.00334 Preprocessor1_Model06
 7 0.000418 roc_auc binary     0.874    10 0.00334 Preprocessor1_Model07
 8 0.000530 roc_auc binary     0.874    10 0.00332 Preprocessor1_Model08
 9 0.000672 roc_auc binary     0.875    10 0.00330 Preprocessor1_Model09
10 0.000853 roc_auc binary     0.875    10 0.00327 Preprocessor1_Model10
# ... with 20 more rows

Plots metrics

pen_metrics %>% 
  ggplot(aes(x = penalty, y= mean)) +
  geom_point() +
  geom_line() + 
  scale_x_log10(labels = scales::label_number())

## Best penalized models

top_models <- 
  pen_res %>% 
  show_best(metric = 'roc_auc') %>% 
  arrange(penalty)

top_models

# A tibble: 5 x 7
   penalty .metric .estimator  mean     n std_err .config              
     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1 0.000853 roc_auc binary     0.875    10 0.00327 Preprocessor1_Model10
2 0.00108  roc_auc binary     0.875    10 0.00326 Preprocessor1_Model11
3 0.00137  roc_auc binary     0.875    10 0.00326 Preprocessor1_Model12
4 0.00174  roc_auc binary     0.875    10 0.00323 Preprocessor1_Model13
5 0.00221  roc_auc binary     0.875    10 0.00315 Preprocessor1_Model14

pen_best <- pen_res %>% 
  select_best()
pen_best

# A tibble: 1 x 2
  penalty .config              
    <dbl> <chr>                
1 0.00137 Preprocessor1_Model12

AUC

pen_auc <- 
  pen_res %>% 
  collect_predictions(parameters = pen_best) %>% 
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Logistic Regression")

autoplot(pen_auc)

Random Forest

cores <- parallel::detectCores()
cores

[1] 16

RF Model

rf_mod <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>% 
  set_engine("ranger", num.threads = cores) %>% 
  set_mode("classification")

RF Recipe

rf_rec <- 
  recipe(children ~ ., data = hotel_train) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date) %>% 
  step_rm(arrival_date)

RF workflow

rf_wf <- 
  workflow() %>% 
  add_model(rf_mod) %>% 
  add_recipe(rf_rec)

rf_mod

Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 500
  min_n = tune()

Engine-Specific Arguments:
  num.threads = cores

Computational engine: ranger

rf_wf

== Workflow ====================================================================
Preprocessor: Recipe
Model: rand_forest()

-- Preprocessor ----------------------------------------------------------------
3 Recipe Steps

* step_date()
* step_holiday()
* step_rm()

-- Model -----------------------------------------------------------------------
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 500
  min_n = tune()

Engine-Specific Arguments:
  num.threads = cores

Computational engine: ranger

rf_mod %>% parameters

Collection of 2 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[?]
      min_n min_n nparam[+]

Model parameters needing finalization:
   # Randomly Selected Predictors ('mtry')

See `?dials::finalize` or `?dials::update.parameters` for more information.

Run RF model

set.seed(2021)
rf_res <- 
  tune_grid(rf_wf,
            resamples = hotel_cv,
            grid = 20,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))

Metrics

rf_res %>% show_best(metric = 'roc_auc')

# A tibble: 5 x 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     5     4 roc_auc binary     0.925    10 0.00152 Preprocessor1_Model19
2     8     6 roc_auc binary     0.923    10 0.00181 Preprocessor1_Model11
3     7    11 roc_auc binary     0.923    10 0.00187 Preprocessor1_Model05
4     5    13 roc_auc binary     0.922    10 0.00155 Preprocessor1_Model18
5    11     9 roc_auc binary     0.919    10 0.00203 Preprocessor1_Model02

rf_best <- rf_res %>% select_best(metric = 'roc_auc')

autoplot(rf_res)

rf_auc <- 
  rf_res %>% 
  collect_predictions(parameters = rf_best) %>% 
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Random Forest")

bind_rows(rf_auc, pen_auc) %>% 
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) + 
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) + 
  coord_equal() + 
  scale_color_viridis_d(option = "plasma", end = .6)

last_rf_mod <- 
  rand_forest(mtry = 8, min_n = 7, trees = 500) %>% 
  set_engine("ranger", num.threads = cores, importance = "impurity") %>% 
  set_mode("classification")

# the last workflow
last_rf_workflow <- 
  rf_wf %>% 
  update_model(last_rf_mod)

# the last fit
set.seed(2021)
last_rf_fit <- 
  last_rf_workflow %>% 
  last_fit(hotels_split)

last_rf_fit

# Resampling results
# Manual resampling 
# A tibble: 1 x 6
  splits         id          .metrics     .notes      .predictions     .workflow
  <list>         <chr>       <list>       <list>      <list>           <list>   
1 <split [37500~ train/test~ <tibble [2 ~ <tibble [0~ <tibble [12,500~ <workflo~

last_rf_fit %>% 
  collect_metrics()

# A tibble: 2 x 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.947 Preprocessor1_Model1
2 roc_auc  binary         0.925 Preprocessor1_Model1

last_rf_fit %>% 
  pluck(".workflow", 1) %>%   
  pull_workflow_fit() %>% 
  vip(num_features = 20)

last_rf_fit %>% 
  collect_predictions() %>% 
  roc_curve(children, .pred_children) %>% 
  autoplot()

Hotels

GS

4/8/2021

Checking data

Data splitting

Data resampling (CV)

Penalized model

Grid generation

Run the model

Metrics

Plots metrics

AUC

Random Forest

RF Model

RF Recipe

RF workflow

Run RF model

Metrics