# Load the TidyTuesday hotel-bookings CSV (2020-02-11 release) from GitHub.
data <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv"
)
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Explore data

# Print a per-column summary of the raw data, grouped by column type
# (missing counts, completeness, and basic distribution statistics).
skimr::skim(data)
Data summary
Name data
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
character 13
Date 1
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hotel 0 1 10 12 0 2 0
arrival_date_month 0 1 3 9 0 12 0
meal 0 1 2 9 0 5 0
country 0 1 2 4 0 178 0
market_segment 0 1 6 13 0 8 0
distribution_channel 0 1 3 9 0 5 0
reserved_room_type 0 1 1 1 0 10 0
assigned_room_type 0 1 1 1 0 12 0
deposit_type 0 1 10 10 0 3 0
agent 0 1 1 4 0 334 0
company 0 1 1 4 0 353 0
customer_type 0 1 5 15 0 4 0
reservation_status 0 1 7 9 0 3 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1 ▇▁▁▁▅
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737 ▇▂▁▁▁
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017 ▃▁▇▁▆
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53 ▅▇▇▇▅
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31 ▇▇▇▇▆
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19 ▇▁▁▁▁
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50 ▇▁▁▁▁
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55 ▇▁▁▁▁
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10 ▇▁▁▁▁
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10 ▇▁▁▁▁
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1 ▇▁▁▁▁
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72 ▇▁▁▁▁
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21 ▇▁▁▁▁
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391 ▇▁▁▁▁
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400 ▇▁▁▁▁
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8 ▇▁▁▁▁
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5 ▇▁▁▁▁
# Columns that are categorical in meaning and should be stored as factors.
# The two 0/1 indicators are included, so the outcome `is_canceled` becomes a
# factor as well.
factors_vec <- c(
  "hotel", "arrival_date_month", "meal", "market_segment",
  "distribution_channel", "reserved_room_type", "deposit_type",
  "customer_type", "is_repeated_guest", "is_canceled"
)

# Build the cleaned table used by the rest of the analysis.
data_clean <- data %>%
  # Drop the problem variables.
  select(
    -c(company, agent, reservation_status, country, assigned_room_type)
  ) %>%
  # Convert the categorical columns listed above to factors.
  mutate(across(all_of(factors_vec), as.factor))

# Explore the cleaned data

# Number of bookings in each cancellation class.
count(data_clean, is_canceled)
## # A tibble: 2 × 2
##   is_canceled     n
##   <fct>       <int>
## 1 0           75166
## 2 1           44224
# Bar chart of the number of bookings per cancellation class.
ggplot(data_clean, aes(x = is_canceled)) +
  geom_bar()

# Cancellation vs. lead time

# Box plot of lead time within each cancellation class.
ggplot(data_clean, aes(x = is_canceled, y = lead_time)) +
  geom_boxplot()

# Correlation plot

# Prepare the indicator (0/1) table for the correlation analysis: rows with
# any missing value are removed first, then the date column is dropped, and
# the remaining columns are binarized.
data_binarized <- binarize(
  select(drop_na(data_clean), -reservation_status_date)
)

# Inspect the resulting indicator columns.
glimpse(data_binarized)
## Rows: 119,386
## Columns: 99
## $ hotel__City_Hotel                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hotel__Resort_Hotel                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_canceled__0                           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ is_canceled__1                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ `lead_time__-Inf_18`                     <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ lead_time__18_69                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ lead_time__69_160                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ lead_time__160_Inf                       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2015                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_year__2016                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2017                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__April                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__August               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__December             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__February             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__January              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__July                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_month__June                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__March                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__May                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__November             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__October              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__September            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_week_number__-Inf_16`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__16_28          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_week_number__28_38          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__38_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_day_of_month__-Inf_8`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_day_of_month__8_16          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__16_23         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__23_Inf        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_weekend_nights__-Inf_1`        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stays_in_weekend_nights__1_2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ stays_in_weekend_nights__2_Inf           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_week_nights__-Inf_1`           <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,…
## $ stays_in_week_nights__1_2                <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,…
## $ stays_in_week_nights__2_3                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ stays_in_week_nights__3_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adults__-Inf_2`                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ adults__2_Inf                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__0                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ children__1                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__2                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `children__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ babies__0                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `babies__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ meal__BB                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ meal__HB                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ meal__SC                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `meal__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ market_segment__Corporate                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ market_segment__Direct                   <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ market_segment__Groups                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `market_segment__Offline_TA/TO`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ market_segment__Online_TA                <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,…
## $ `market_segment__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Corporate          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Direct             <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ `distribution_channel__TA/TO`            <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,…
## $ `distribution_channel__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ is_repeated_guest__0                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_repeated_guest__1                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_cancellations__0                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_cancellations__1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_cancellations__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_bookings_not_canceled__0        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_bookings_not_canceled__1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_bookings_not_canceled__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__A                    <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ reserved_room_type__D                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ reserved_room_type__E                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__F                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__G                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `reserved_room_type__-OTHER`             <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ booking_changes__0                       <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ booking_changes__1                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ booking_changes__2                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `booking_changes__-OTHER`                <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ deposit_type__No_Deposit                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ deposit_type__Non_Refund                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `deposit_type__-OTHER`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ days_in_waiting_list__0                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `days_in_waiting_list__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Contract                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Transient                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `customer_type__Transient-Party`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `customer_type__-OTHER`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adr__-Inf_69.29`                        <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ adr__69.29_94.59                         <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,…
## $ adr__94.59_126                           <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ adr__126_Inf                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ required_car_parking_spaces__0           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ required_car_parking_spaces__1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `required_car_parking_spaces__-OTHER`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__0             <dbl> 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,…
## $ total_of_special_requests__1             <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,…
## $ total_of_special_requests__2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__3             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `total_of_special_requests__-OTHER`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Correlate every indicator column with the "canceled" indicator.
data_correlation <- correlate(data_binarized, target = is_canceled__1)

# Show the resulting feature / bin / correlation table.
data_correlation
## # A tibble: 99 × 3
##    feature                   bin        correlation
##    <fct>                     <chr>            <dbl>
##  1 is_canceled               0               -1    
##  2 is_canceled               1                1    
##  3 deposit_type              Non_Refund       0.481
##  4 deposit_type              No_Deposit      -0.478
##  5 previous_cancellations    1                0.275
##  6 previous_cancellations    0               -0.271
##  7 lead_time                 -Inf_18         -0.270
##  8 total_of_special_requests 0                0.265
##  9 market_segment            Groups           0.222
## 10 lead_time                 160_Inf          0.220
## # ℹ 89 more rows
# Plot the correlations as a correlation funnel.
correlationfunnel::plot_correlation_funnel(data_correlation)

# Split data

# Seed the RNG before the random steps below (subsample, split, folds).
set.seed(1234)

# Keep a 5,000-row random subsample of the complete cases.
# NOTE: this overwrites `data_clean`.
data_clean <- sample_n(drop_na(data_clean), size = 5000)

# Train/test split, stratified on the outcome.
data_split <- data_clean %>% initial_split(strata = is_canceled)
data_train <- data_split %>% training()
data_test  <- data_split %>% testing()

# Cross-validation folds on the training portion, also stratified.
data_cv <- data_train %>% rsample::vfold_cv(strata = is_canceled)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3374/376]> Fold01
##  2 <split [3374/376]> Fold02
##  3 <split [3375/375]> Fold03
##  4 <split [3375/375]> Fold04
##  5 <split [3375/375]> Fold05
##  6 <split [3375/375]> Fold06
##  7 <split [3375/375]> Fold07
##  8 <split [3375/375]> Fold08
##  9 <split [3376/374]> Fold09
## 10 <split [3376/374]> Fold10

# Preprocess data

# Preprocessing recipe for the XGBoost model, defined on the training set:
#   * `reservation_status_date` stays in the data but is given the role "ID"
#     instead of "predictor";
#   * nominal predictors are converted to dummy variables;
#   * SMOTE is applied to the outcome `is_canceled`.
xgboost_rec <- recipe(is_canceled ~ ., data = data_train) %>%
  update_role(reservation_status_date, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(is_canceled)

# Preview the processed training data. `bake(new_data = NULL)` replaces the
# superseded `juice()` and returns the same retained training set.
xgboost_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 4,764
## Columns: 60
## $ lead_time                      <dbl> 7, 0, 27, 107, 1, 186, 82, 38, 17, 109,…
## $ arrival_date_year              <dbl> 2017, 2017, 2017, 2016, 2016, 2017, 201…
## $ arrival_date_week_number       <dbl> 8, 8, 9, 46, 3, 34, 40, 7, 38, 33, 11, …
## $ arrival_date_day_of_month      <dbl> 23, 24, 28, 6, 14, 20, 29, 12, 13, 16, …
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 1, 0, 0, …
## $ stays_in_week_nights           <dbl> 1, 1, 1, 2, 1, 2, 8, 5, 4, 4, 2, 2, 1, …
## $ adults                         <dbl> 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 1, 1, …
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ booking_changes                <dbl> 1, 1, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, …
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ adr                            <dbl> 75.00, 88.00, 56.00, 117.28, 66.70, 117…
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <dbl> 1, 0, 2, 0, 0, 0, 1, 2, 0, 1, 0, 1, 2, …
## $ reservation_status_date        <date> 2017-02-24, 2017-02-25, 2017-03-01, 20…
## $ is_canceled                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hotel_Resort.Hotel             <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, …
## $ arrival_date_month_August      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ arrival_date_month_December    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_February    <dbl> 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, …
## $ arrival_date_month_January     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_July        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_June        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_March       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ arrival_date_month_May         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_November    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_October     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_September   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, …
## $ meal_FB                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal_HB                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal_SC                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ meal_Undefined                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Complementary   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Corporate       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Direct          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ market_segment_Groups          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ market_segment_Offline.TA.TO   <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Online.TA       <dbl> 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ market_segment_Undefined       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ distribution_channel_Direct    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ distribution_channel_GDS       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ distribution_channel_TA.TO     <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, …
## $ distribution_channel_Undefined <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ is_repeated_guest_X1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_B           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_C           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_D           <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_E           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ reserved_room_type_F           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_G           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_H           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_L           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_P           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type_Non.Refund        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type_Refundable        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type_Group            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type_Transient        <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ customer_type_Transient.Party  <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …

# Specify model

# Boosted-tree classifier on the xgboost engine. Every hyperparameter listed
# here is marked with `tune()`, so its value is left to the tuning step
# instead of being fixed in the specification.
xgboost_spec <- boost_tree(
  trees = tune(),
  learn_rate = tune(),
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Bundle the preprocessing recipe and the model specification together.
xgboost_workflow <- workflow() %>%
  add_recipe(xgboost_rec) %>%
  add_model(xgboost_spec)

## Tune hyperparameters

# Register a parallel backend before tuning
# (presumably so the resamples can be processed in parallel).
doParallel::registerDoParallel()

# Seed the RNG before the tuning run.
set.seed(65743)

# Tune the workflow over the cross-validation folds with `grid = 5`.
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)