Apply 7

data <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv")

## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

##Explore data

skimr::skim(data)

Data summary
Name	data
Number of rows	119390
Number of columns	32
_______________________
Column type frequency:
character	13
Date	1
numeric	18
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
hotel	1	10	12	2
arrival_date_month	1	3	9	12
meal	1	2	9	5
country	1	2	4	178
market_segment	1	6	13	8
distribution_channel	1	3	9	5
reserved_room_type	1	1	1	10
assigned_room_type	1	1	1	12
deposit_type	1	10	10	3
agent	1	1	4	334
company	1	1	4	353
customer_type	1	5	15	4
reservation_status	1	7	9	3

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
reservation_status_date	0	1	2014-10-17	2017-09-14	2016-08-07	926

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
is_canceled	0	1	0.37	0.48	0.00	0.00	0.00	1	1	▇▁▁▁▅
lead_time	0	1	104.01	106.86	0.00	18.00	69.00	160	737	▇▂▁▁▁
arrival_date_year	0	1	2016.16	0.71	2015.00	2016.00	2016.00	2017	2017	▃▁▇▁▆
arrival_date_week_number	0	1	27.17	13.61	1.00	16.00	28.00	38	53	▅▇▇▇▅
arrival_date_day_of_month	0	1	15.80	8.78	1.00	8.00	16.00	23	31	▇▇▇▇▆
stays_in_weekend_nights	0	1	0.93	1.00	0.00	0.00	1.00	2	19	▇▁▁▁▁
stays_in_week_nights	0	1	2.50	1.91	0.00	1.00	2.00	3	50	▇▁▁▁▁
adults	0	1	1.86	0.58	0.00	2.00	2.00	2	55	▇▁▁▁▁
children	4	1	0.10	0.40	0.00	0.00	0.00	0	10	▇▁▁▁▁
babies	0	1	0.01	0.10	0.00	0.00	0.00	0	10	▇▁▁▁▁
is_repeated_guest	0	1	0.03	0.18	0.00	0.00	0.00	0	1	▇▁▁▁▁
previous_cancellations	0	1	0.09	0.84	0.00	0.00	0.00	0	26	▇▁▁▁▁
previous_bookings_not_canceled	0	1	0.14	1.50	0.00	0.00	0.00	0	72	▇▁▁▁▁
booking_changes	0	1	0.22	0.65	0.00	0.00	0.00	0	21	▇▁▁▁▁
days_in_waiting_list	0	1	2.32	17.59	0.00	0.00	0.00	0	391	▇▁▁▁▁
adr	0	1	101.83	50.54	-6.38	69.29	94.58	126	5400	▇▁▁▁▁
required_car_parking_spaces	0	1	0.06	0.25	0.00	0.00	0.00	0	8	▇▁▁▁▁
total_of_special_requests	0	1	0.57	0.79	0.00	0.00	0.00	1	5	▇▁▁▁▁

# Create a vector of column names to convert to factors
factors_vec <- data %>% 
  select(hotel, arrival_date_month, meal, market_segment,
         distribution_channel, reserved_room_type, deposit_type,
         customer_type, is_repeated_guest, is_canceled) %>% 
  names()

# Clean the data
data_clean <- data %>%
  
  # Address factors imported as character/numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # Drop only the specific problem variables
  select(-company, -agent, -reservation_status, -country, -assigned_room_type) %>%
  
  # Rename target levels to match code along style
  mutate(is_canceled = if_else(is_canceled == 1, "Canceled", "Not_Canceled"),
         is_canceled = as.factor(is_canceled))

#explore data

# Explore data
data_clean %>% count(is_canceled)

## # A tibble: 2 × 2
##   is_canceled      n
##   <fct>        <int>
## 1 Canceled     44224
## 2 Not_Canceled 75166

data_clean %>% 
  ggplot(aes(is_canceled)) + 
  geom_bar()

cancellation vs lead time

# Visualize cancellations across lead time
data_clean %>% 
  ggplot(aes(is_canceled, lead_time)) + 
  geom_boxplot()

correlation plot

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  drop_na() %>%
  select(-reservation_status_date) %>%
  binarize()

data_binarized %>% glimpse()

## Rows: 119,386
## Columns: 99
## $ hotel__City_Hotel                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hotel__Resort_Hotel                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_canceled__Canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ is_canceled__Not_Canceled                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ `lead_time__-Inf_18`                     <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ lead_time__18_69                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ lead_time__69_160                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ lead_time__160_Inf                       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2015                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_year__2016                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2017                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__April                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__August               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__December             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__February             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__January              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__July                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_month__June                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__March                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__May                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__November             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__October              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__September            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_week_number__-Inf_16`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__16_28          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_week_number__28_38          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__38_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_day_of_month__-Inf_8`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_day_of_month__8_16          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__16_23         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__23_Inf        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_weekend_nights__-Inf_1`        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stays_in_weekend_nights__1_2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ stays_in_weekend_nights__2_Inf           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_week_nights__-Inf_1`           <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,…
## $ stays_in_week_nights__1_2                <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,…
## $ stays_in_week_nights__2_3                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ stays_in_week_nights__3_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adults__-Inf_2`                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ adults__2_Inf                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__0                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ children__1                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__2                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `children__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ babies__0                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `babies__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ meal__BB                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ meal__HB                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ meal__SC                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `meal__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ market_segment__Corporate                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ market_segment__Direct                   <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ market_segment__Groups                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `market_segment__Offline_TA/TO`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ market_segment__Online_TA                <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,…
## $ `market_segment__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Corporate          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Direct             <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ `distribution_channel__TA/TO`            <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,…
## $ `distribution_channel__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ is_repeated_guest__0                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_repeated_guest__1                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_cancellations__0                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_cancellations__1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_cancellations__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_bookings_not_canceled__0        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_bookings_not_canceled__1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_bookings_not_canceled__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__A                    <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ reserved_room_type__D                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ reserved_room_type__E                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__F                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__G                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `reserved_room_type__-OTHER`             <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ booking_changes__0                       <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ booking_changes__1                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ booking_changes__2                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `booking_changes__-OTHER`                <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ deposit_type__No_Deposit                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ deposit_type__Non_Refund                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `deposit_type__-OTHER`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ days_in_waiting_list__0                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `days_in_waiting_list__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Contract                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Transient                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `customer_type__Transient-Party`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `customer_type__-OTHER`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adr__-Inf_69.29`                        <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ adr__69.29_94.59                         <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,…
## $ adr__94.59_126                           <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ adr__126_Inf                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ required_car_parking_spaces__0           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ required_car_parking_spaces__1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `required_car_parking_spaces__-OTHER`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__0             <dbl> 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,…
## $ total_of_special_requests__1             <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,…
## $ total_of_special_requests__2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__3             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `total_of_special_requests__-OTHER`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

# Step 2: Correlation analysis against the target variable
data_correlation <- data_binarized %>%
  correlate(target = is_canceled__Canceled)

data_correlation

## # A tibble: 99 × 3
##    feature                   bin          correlation
##    <fct>                     <chr>              <dbl>
##  1 is_canceled               Canceled           1    
##  2 is_canceled               Not_Canceled      -1    
##  3 deposit_type              Non_Refund         0.481
##  4 deposit_type              No_Deposit        -0.478
##  5 previous_cancellations    1                  0.275
##  6 previous_cancellations    0                 -0.271
##  7 lead_time                 -Inf_18           -0.270
##  8 total_of_special_requests 0                  0.265
##  9 market_segment            Groups             0.222
## 10 lead_time                 160_Inf            0.220
## # ℹ 89 more rows

#step 3:plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

#split data

set.seed(1234)

data_clean <- data_clean %>%
  drop_na() %>%
  sample_n(3000)

data_split <- initial_split(data_clean, strata = is_canceled)
data_train <- training(data_split)
data_test  <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, v = 3, strata = is_canceled)
data_cv

## #  3-fold cross-validation using stratification 
## # A tibble: 3 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1500/750]> Fold1
## 2 <split [1500/750]> Fold2
## 3 <split [1500/750]> Fold3

Preprocess data

xgboost_rec <- recipes::recipe(is_canceled ~ ., data = data_train) %>%
  update_role(reservation_status_date, new_role = "ID") %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(is_canceled)

xgboost_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 2,856
## Columns: 60
## $ lead_time                      <dbl> 176, 119, 175, 103, 68, 10, 164, 29, 67…
## $ arrival_date_year              <dbl> 2016, 2016, 2017, 2016, 2016, 2016, 201…
## $ arrival_date_week_number       <dbl> 15, 45, 20, 17, 18, 15, 20, 21, 43, 43,…
## $ arrival_date_day_of_month      <dbl> 9, 4, 19, 19, 24, 8, 15, 18, 17, 20, 1,…
## $ stays_in_weekend_nights        <dbl> 1, 0, 0, 0, 2, 0, 1, 0, 1, 0, 1, 2, 0, …
## $ stays_in_week_nights           <dbl> 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 0, 1, …
## $ adults                         <dbl> 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, …
## $ children                       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ booking_changes                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ adr                            <dbl> 125.00, 79.20, 90.00, 115.00, 79.56, 13…
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <dbl> 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, …
## $ reservation_status_date        <date> 2015-11-23, 2016-11-04, 2016-11-25, 20…
## $ is_canceled                    <fct> Canceled, Canceled, Canceled, Canceled,…
## $ hotel_Resort.Hotel             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ arrival_date_month_August      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ arrival_date_month_December    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_February    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_January     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_July        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_June        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_March       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_May         <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, …
## $ arrival_date_month_November    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ arrival_date_month_October     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, …
## $ arrival_date_month_September   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal_FB                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal_HB                        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ meal_SC                        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ meal_Undefined                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Complementary   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Corporate       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Direct          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Groups          <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ market_segment_Offline.TA.TO   <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ market_segment_Online.TA       <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ market_segment_Undefined       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ distribution_channel_Direct    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ distribution_channel_GDS       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ distribution_channel_TA.TO     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ distribution_channel_Undefined <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ is_repeated_guest_X1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_B           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_C           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_D           <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ reserved_room_type_E           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ reserved_room_type_F           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_G           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_H           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_L           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type_P           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type_Non.Refund        <dbl> 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ deposit_type_Refundable        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type_Group            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type_Transient        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ customer_type_Transient.Party  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

#Specify model

xgboost_spec <- 
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec)

##tune hyperparameters

doParallel::registerDoParallel()

set.seed(65743)
xgboost_tune <-
  tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 2,
    control = control_grid(save_pred = TRUE)
  )

#Model evaluation

collect_metrics(xgboost_tune)

## # A tibble: 6 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric    
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>      
## 1     1    40          1      0.316  31.6                  1   accuracy   
## 2     1    40          1      0.316  31.6                  1   brier_class
## 3     1    40          1      0.316  31.6                  1   roc_auc    
## 4  2000     2         15      0.001   0.0000000001         0.5 accuracy   
## 5  2000     2         15      0.001   0.0000000001         0.5 brier_class
## 6  2000     2         15      0.001   0.0000000001         0.5 roc_auc    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(is_canceled, .pred_Canceled) %>%
    autoplot()

#Fit model for last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config        
##   <chr>       <chr>          <dbl> <chr>          
## 1 accuracy    binary         0.821 pre0_mod0_post0
## 2 roc_auc     binary         0.877 pre0_mod0_post0
## 3 brier_class binary         0.132 pre0_mod0_post0

collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(is_canceled, .pred_class) %>%
    autoplot()

#variable importance

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

Apply 7

Matthew Plutzner

2026-04-14