data <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

##Explore data

skimr::skim(data)
Data summary
Name data
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
character 13
Date 1
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hotel 0 1 10 12 0 2 0
arrival_date_month 0 1 3 9 0 12 0
meal 0 1 2 9 0 5 0
country 0 1 2 4 0 178 0
market_segment 0 1 6 13 0 8 0
distribution_channel 0 1 3 9 0 5 0
reserved_room_type 0 1 1 1 0 10 0
assigned_room_type 0 1 1 1 0 12 0
deposit_type 0 1 10 10 0 3 0
agent 0 1 1 4 0 334 0
company 0 1 1 4 0 353 0
customer_type 0 1 5 15 0 4 0
reservation_status 0 1 7 9 0 3 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1 ▇▁▁▁▅
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737 ▇▂▁▁▁
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017 ▃▁▇▁▆
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53 ▅▇▇▇▅
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31 ▇▇▇▇▆
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19 ▇▁▁▁▁
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50 ▇▁▁▁▁
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55 ▇▁▁▁▁
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10 ▇▁▁▁▁
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10 ▇▁▁▁▁
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1 ▇▁▁▁▁
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72 ▇▁▁▁▁
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21 ▇▁▁▁▁
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391 ▇▁▁▁▁
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400 ▇▁▁▁▁
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8 ▇▁▁▁▁
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5 ▇▁▁▁▁
# Create a vector of column names to convert to factors
factors_vec <- data %>% 
  select(hotel, arrival_date_month, meal, market_segment,
         distribution_channel, reserved_room_type, deposit_type,
         customer_type, is_repeated_guest, is_canceled) %>% 
  names()

# Clean the data
data_clean <- data %>%
  
  # Address factors imported as character/numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # Drop only the specific problem variables
  select(-company, -agent, -reservation_status, -country, -assigned_room_type) %>%
  
  # Rename target levels to match code along style
  mutate(is_canceled = if_else(is_canceled == 1, "Canceled", "Not_Canceled"),
         is_canceled = as.factor(is_canceled))

#explore data

# Explore data
data_clean %>% count(is_canceled)
## # A tibble: 2 × 2
##   is_canceled      n
##   <fct>        <int>
## 1 Canceled     44224
## 2 Not_Canceled 75166
data_clean %>% 
  ggplot(aes(is_canceled)) + 
  geom_bar()

cancellation vs lead time

# Visualize cancellations across lead time
data_clean %>% 
  ggplot(aes(is_canceled, lead_time)) + 
  geom_boxplot()

correlation plot

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  drop_na() %>%
  select(-reservation_status_date) %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 119,386
## Columns: 99
## $ hotel__City_Hotel                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hotel__Resort_Hotel                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_canceled__Canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ is_canceled__Not_Canceled                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ `lead_time__-Inf_18`                     <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ lead_time__18_69                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ lead_time__69_160                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ lead_time__160_Inf                       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2015                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_year__2016                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2017                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__April                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__August               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__December             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__February             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__January              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__July                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_month__June                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__March                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__May                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__November             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__October              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__September            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_week_number__-Inf_16`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__16_28          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_week_number__28_38          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__38_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_day_of_month__-Inf_8`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_day_of_month__8_16          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__16_23         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__23_Inf        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_weekend_nights__-Inf_1`        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stays_in_weekend_nights__1_2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ stays_in_weekend_nights__2_Inf           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_week_nights__-Inf_1`           <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,…
## $ stays_in_week_nights__1_2                <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,…
## $ stays_in_week_nights__2_3                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ stays_in_week_nights__3_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adults__-Inf_2`                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ adults__2_Inf                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__0                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ children__1                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__2                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `children__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ babies__0                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `babies__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ meal__BB                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ meal__HB                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ meal__SC                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `meal__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ market_segment__Corporate                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ market_segment__Direct                   <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ market_segment__Groups                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `market_segment__Offline_TA/TO`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ market_segment__Online_TA                <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,…
## $ `market_segment__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Corporate          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Direct             <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ `distribution_channel__TA/TO`            <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,…
## $ `distribution_channel__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ is_repeated_guest__0                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_repeated_guest__1                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_cancellations__0                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_cancellations__1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_cancellations__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_bookings_not_canceled__0        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_bookings_not_canceled__1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_bookings_not_canceled__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__A                    <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ reserved_room_type__D                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ reserved_room_type__E                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__F                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__G                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `reserved_room_type__-OTHER`             <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ booking_changes__0                       <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ booking_changes__1                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ booking_changes__2                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `booking_changes__-OTHER`                <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ deposit_type__No_Deposit                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ deposit_type__Non_Refund                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `deposit_type__-OTHER`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ days_in_waiting_list__0                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `days_in_waiting_list__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Contract                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Transient                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `customer_type__Transient-Party`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `customer_type__-OTHER`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adr__-Inf_69.29`                        <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ adr__69.29_94.59                         <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,…
## $ adr__94.59_126                           <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ adr__126_Inf                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ required_car_parking_spaces__0           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ required_car_parking_spaces__1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `required_car_parking_spaces__-OTHER`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__0             <dbl> 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,…
## $ total_of_special_requests__1             <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,…
## $ total_of_special_requests__2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__3             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `total_of_special_requests__-OTHER`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Step 2: Correlation analysis against the target variable
data_correlation <- data_binarized %>%
  correlate(target = is_canceled__Canceled)

data_correlation
## # A tibble: 99 × 3
##    feature                   bin          correlation
##    <fct>                     <chr>              <dbl>
##  1 is_canceled               Canceled           1    
##  2 is_canceled               Not_Canceled      -1    
##  3 deposit_type              Non_Refund         0.481
##  4 deposit_type              No_Deposit        -0.478
##  5 previous_cancellations    1                  0.275
##  6 previous_cancellations    0                 -0.271
##  7 lead_time                 -Inf_18           -0.270
##  8 total_of_special_requests 0                  0.265
##  9 market_segment            Groups             0.222
## 10 lead_time                 160_Inf            0.220
## # ℹ 89 more rows
#step 3:plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

#split data

set.seed(1234)

data_clean <- data_clean %>%
  drop_na() %>%
  sample_n(3000)

data_split <- initial_split(data_clean, strata = is_canceled)
data_train <- training(data_split)
data_test  <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, v = 3, strata = is_canceled)
data_cv
## #  3-fold cross-validation using stratification 
## # A tibble: 3 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1500/750]> Fold1
## 2 <split [1500/750]> Fold2
## 3 <split [1500/750]> Fold3

Preprocess data

xgboost_rec <- recipes::recipe(is_canceled ~ ., data = data_train) %>%
  update_role(reservation_status_date, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(is_canceled)

xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 2,856
## Columns: 60
## $ lead_time                      <dbl> 176, 119, 175, 103, 68, 10, 164, 29, 67…
## $ arrival_date_year              <dbl> 2016, 2016, 2017, 2016, 2016, 2016, 201…
## $ arrival_date_week_number       <dbl> 15, 45, 20, 17, 18, 15, 20, 21, 43, 43,…
## $ arrival_date_day_of_month      <dbl> 9, 4, 19, 19, 24, 8, 15, 18, 17, 20, 1,…
## $ stays_in_weekend_nights        <dbl> 1, 0, 0, 0, 2, 0, 1, 0, 1, 0, 1, 2, 0, …
## $ stays_in_week_nights           <dbl> 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 0, 1, …
## $ adults                         <dbl> 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, …
## $ children                       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ booking_changes                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ adr                            <dbl> 125.00, 79.20, 90.00, 115.00, 79.56, 13…
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <dbl> 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, …
## $ reservation_status_date        <date> 2015-11-23, 2016-11-04, 2016-11-25, 20…
## $ is_canceled                    <fct> Canceled, Canceled, Canceled, Canceled,…
## $ hotel_Resort.Hotel             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ arrival_date_month_August      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ arrival_date_month_December    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_February    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_January     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_July        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_June        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_March       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_May         <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, …
## $ arrival_date_month_November    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_October     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, …
## $ arrival_date_month_September   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal_FB                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal_HB                        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ meal_SC                        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ meal_Undefined                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Complementary   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Corporate       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Direct          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Groups          <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Offline.TA.TO   <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ market_segment_Online.TA       <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ market_segment_Undefined       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ distribution_channel_Direct    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ distribution_channel_GDS       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ distribution_channel_TA.TO     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ distribution_channel_Undefined <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ is_repeated_guest_X1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_B           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_C           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_D           <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ reserved_room_type_E           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ reserved_room_type_F           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_G           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_H           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_L           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_P           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type_Non.Refund        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ deposit_type_Refundable        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type_Group            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type_Transient        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ customer_type_Transient.Party  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

#Specify model

xgboost_spec <- 
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 

##tune hyperparameters

doParallel::registerDoParallel()

set.seed(65743)
xgboost_tune <-
  tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 2,
    control = control_grid(save_pred = TRUE)
  )

#Model evaluation

collect_metrics(xgboost_tune)
## # A tibble: 6 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric    
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>      
## 1     1    40          1      0.316  31.6                  1   accuracy   
## 2     1    40          1      0.316  31.6                  1   brier_class
## 3     1    40          1      0.316  31.6                  1   roc_auc    
## 4  2000     2         15      0.001   0.0000000001         0.5 accuracy   
## 5  2000     2         15      0.001   0.0000000001         0.5 brier_class
## 6  2000     2         15      0.001   0.0000000001         0.5 roc_auc    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(is_canceled, .pred_Canceled) %>%
    autoplot()

#Fit model for last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config        
##   <chr>       <chr>          <dbl> <chr>          
## 1 accuracy    binary         0.821 pre0_mod0_post0
## 2 roc_auc     binary         0.877 pre0_mod0_post0
## 3 brier_class binary         0.132 pre0_mod0_post0
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(is_canceled, .pred_class) %>%
    autoplot()

#variable importance

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()