data <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

##Explore data

skimr::skim(data)
Data summary
Name data
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
character 13
Date 1
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hotel 0 1 10 12 0 2 0
arrival_date_month 0 1 3 9 0 12 0
meal 0 1 2 9 0 5 0
country 0 1 2 4 0 178 0
market_segment 0 1 6 13 0 8 0
distribution_channel 0 1 3 9 0 5 0
reserved_room_type 0 1 1 1 0 10 0
assigned_room_type 0 1 1 1 0 12 0
deposit_type 0 1 10 10 0 3 0
agent 0 1 1 4 0 334 0
company 0 1 1 4 0 353 0
customer_type 0 1 5 15 0 4 0
reservation_status 0 1 7 9 0 3 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1 ▇▁▁▁▅
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737 ▇▂▁▁▁
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017 ▃▁▇▁▆
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53 ▅▇▇▇▅
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31 ▇▇▇▇▆
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19 ▇▁▁▁▁
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50 ▇▁▁▁▁
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55 ▇▁▁▁▁
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10 ▇▁▁▁▁
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10 ▇▁▁▁▁
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1 ▇▁▁▁▁
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72 ▇▁▁▁▁
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21 ▇▁▁▁▁
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391 ▇▁▁▁▁
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400 ▇▁▁▁▁
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8 ▇▁▁▁▁
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5 ▇▁▁▁▁
# Create a vector of column names to convert to factors
factors_vec <- data %>% 
  select(hotel, arrival_date_month, meal, market_segment,
         distribution_channel, reserved_room_type, deposit_type,
         customer_type, is_repeated_guest, is_canceled) %>% 
  names()

# Clean the data
data_clean <- data %>%
  
  # Address factors imported as character/numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # Drop only the specific problem variables
  select(-company, -agent, -reservation_status, -country, -assigned_room_type) %>%
  
  # Rename target levels to match code along style
  mutate(is_canceled = if_else(is_canceled == 1, "Canceled", "Not_Canceled"),
         is_canceled = as.factor(is_canceled))

#explore data

# Explore data
data_clean %>% count(is_canceled)
## # A tibble: 2 × 2
##   is_canceled      n
##   <fct>        <int>
## 1 Canceled     44224
## 2 Not_Canceled 75166
data_clean %>% 
  ggplot(aes(is_canceled)) + 
  geom_bar()

cancellation vs lead time

# Visualize cancellations across lead time
data_clean %>% 
  ggplot(aes(is_canceled, lead_time)) + 
  geom_boxplot()

correlation plot

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  drop_na() %>%
  select(-reservation_status_date) %>%
  binarize()

data_binarized %>% glimpse()
## Rows: 119,386
## Columns: 99
## $ hotel__City_Hotel                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hotel__Resort_Hotel                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_canceled__Canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ is_canceled__Not_Canceled                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ `lead_time__-Inf_18`                     <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ lead_time__18_69                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ lead_time__69_160                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ lead_time__160_Inf                       <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2015                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_year__2016                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2017                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__April                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__August               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__December             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__February             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__January              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__July                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_month__June                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__March                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__May                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__November             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__October              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__September            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_week_number__-Inf_16`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__16_28          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_week_number__28_38          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__38_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_day_of_month__-Inf_8`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_day_of_month__8_16          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__16_23         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__23_Inf        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_weekend_nights__-Inf_1`        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stays_in_weekend_nights__1_2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ stays_in_weekend_nights__2_Inf           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_week_nights__-Inf_1`           <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,…
## $ stays_in_week_nights__1_2                <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,…
## $ stays_in_week_nights__2_3                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ stays_in_week_nights__3_Inf              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adults__-Inf_2`                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ adults__2_Inf                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__0                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ children__1                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__2                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `children__-OTHER`                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ babies__0                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `babies__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ meal__BB                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ meal__HB                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ meal__SC                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `meal__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ market_segment__Corporate                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ market_segment__Direct                   <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ market_segment__Groups                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `market_segment__Offline_TA/TO`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ market_segment__Online_TA                <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,…
## $ `market_segment__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Corporate          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Direct             <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ `distribution_channel__TA/TO`            <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,…
## $ `distribution_channel__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ is_repeated_guest__0                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_repeated_guest__1                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_cancellations__0                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_cancellations__1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_cancellations__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_bookings_not_canceled__0        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_bookings_not_canceled__1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_bookings_not_canceled__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__A                    <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ reserved_room_type__D                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ reserved_room_type__E                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__F                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__G                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `reserved_room_type__-OTHER`             <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ booking_changes__0                       <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ booking_changes__1                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ booking_changes__2                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `booking_changes__-OTHER`                <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ deposit_type__No_Deposit                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ deposit_type__Non_Refund                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `deposit_type__-OTHER`                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ days_in_waiting_list__0                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `days_in_waiting_list__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Contract                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Transient                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `customer_type__Transient-Party`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `customer_type__-OTHER`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adr__-Inf_69.29`                        <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ adr__69.29_94.59                         <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,…
## $ adr__94.59_126                           <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ adr__126_Inf                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ required_car_parking_spaces__0           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ required_car_parking_spaces__1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `required_car_parking_spaces__-OTHER`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__0             <dbl> 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,…
## $ total_of_special_requests__1             <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,…
## $ total_of_special_requests__2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__3             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `total_of_special_requests__-OTHER`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Step 2: Correlation analysis against the target variable
data_correlation <- data_binarized %>%
  correlate(target = is_canceled__Canceled)

data_correlation
## # A tibble: 99 × 3
##    feature                   bin          correlation
##    <fct>                     <chr>              <dbl>
##  1 is_canceled               Canceled           1    
##  2 is_canceled               Not_Canceled      -1    
##  3 deposit_type              Non_Refund         0.481
##  4 deposit_type              No_Deposit        -0.478
##  5 previous_cancellations    1                  0.275
##  6 previous_cancellations    0                 -0.271
##  7 lead_time                 -Inf_18           -0.270
##  8 total_of_special_requests 0                  0.265
##  9 market_segment            Groups             0.222
## 10 lead_time                 160_Inf            0.220
## # ℹ 89 more rows
#step 3:plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()

#split data

set.seed(1234)

data_clean <- data_clean %>%
  drop_na() %>%
  sample_n(3000)

data_split <- initial_split(data_clean, strata = is_canceled)
data_train <- training(data_split)
data_test  <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, v = 3, strata = is_canceled)
data_cv
## #  3-fold cross-validation using stratification 
## # A tibble: 3 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1500/750]> Fold1
## 2 <split [1500/750]> Fold2
## 3 <split [1500/750]> Fold3

#Preprocess data

xgboost_rec <- recipes::recipe(is_canceled ~ ., data = data_train) %>%
  step_date(reservation_status_date, features = c("year", "month", "dow")) %>%
  step_rm(reservation_status_date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_pca(all_numeric_predictors(), threshold = 0.99)

xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Warning: !  The following columns have zero variance so scaling cannot be used:
##   market_segment_Undefined, distribution_channel_Undefined,
##   reserved_room_type_L, and reserved_room_type_P.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##   before normalizing.
## Rows: 2,250
## Columns: 63
## $ is_canceled <fct> Canceled, Canceled, Canceled, Canceled, Canceled, Canceled…
## $ PC01        <dbl> -1.9194210, -0.1254247, -0.9212166, -0.4488066, 0.2143022,…
## $ PC02        <dbl> 0.9688114, 0.5445966, 0.9407378, 0.9669918, 0.4601126, 1.1…
## $ PC03        <dbl> 0.3269090, -0.7530889, 2.1848271, 1.9602386, 1.1607782, -0…
## $ PC04        <dbl> 0.18919929, 4.49291445, 1.24741204, 0.44264502, 0.14534746…
## $ PC05        <dbl> 1.35817302, -0.50659173, 3.23475188, 1.41732053, -0.036467…
## $ PC06        <dbl> -1.7586495, 0.3036193, -0.6590210, -0.3890884, -0.9655685,…
## $ PC07        <dbl> 0.588546204, 1.729328349, 1.059147914, -0.652387064, -0.73…
## $ PC08        <dbl> -0.50991922, -0.54620735, -0.03949499, -0.87363653, -0.541…
## $ PC09        <dbl> 0.138727827, -0.838240169, -1.641032450, 1.452945293, 0.41…
## $ PC10        <dbl> -0.626528565, -1.433083626, -1.042241809, 0.188199201, 0.1…
## $ PC11        <dbl> 0.22436435, -0.42187141, 0.61954116, 0.40390192, 0.0609452…
## $ PC12        <dbl> 0.23660241, 1.81889242, 0.40090135, -0.74869201, -1.034950…
## $ PC13        <dbl> -2.1542008, -3.1451840, -2.0185194, -1.4535937, -1.1238439…
## $ PC14        <dbl> 0.215312920, -1.066899788, 0.826057379, -0.008613257, -0.6…
## $ PC15        <dbl> 0.52260468, -0.46866366, 0.06573072, 0.99447298, 2.0879059…
## $ PC16        <dbl> 1.64690227, -0.39661009, -0.06908672, 0.26111862, 0.220708…
## $ PC17        <dbl> -0.47699166, 1.21887738, 0.49908099, -1.27156121, -1.70089…
## $ PC18        <dbl> -1.71296288, -0.94585970, -0.43475650, -1.16716655, -0.352…
## $ PC19        <dbl> -0.31933885, -0.57767696, -0.86190985, -0.28531635, 1.4500…
## $ PC20        <dbl> 0.1692440, -0.1311913, 1.3383605, -0.5122346, 0.0430493, -…
## $ PC21        <dbl> 0.02515021, -0.10264720, 0.02089192, -0.34821817, 1.356207…
## $ PC22        <dbl> -0.507859548, -0.775090995, -0.107947455, 1.092585346, 0.0…
## $ PC23        <dbl> 1.66383728, -1.23346984, -1.15733214, 1.68355189, 0.516724…
## $ PC24        <dbl> 0.58329078, -0.61702671, -1.02295314, -0.40908951, 0.92100…
## $ PC25        <dbl> 1.33837307, -0.67582530, -0.06402532, -0.93783611, -0.0913…
## $ PC26        <dbl> 0.12464590, 1.73619903, 0.38191045, -0.86893066, -0.856271…
## $ PC27        <dbl> 0.07590962, 0.71005872, 0.15462369, -0.18661648, 0.7581155…
## $ PC28        <dbl> 0.8855875, -0.6796415, -1.0571121, -1.0414294, -0.3412472,…
## $ PC29        <dbl> 0.86067662, 0.11202209, -0.36911862, 0.38797016, -1.212295…
## $ PC30        <dbl> -0.43411889, 0.72141762, -0.03675400, -0.24687637, 0.64020…
## $ PC31        <dbl> -0.27227229, -0.05009608, 0.27503320, 0.64195938, -0.09489…
## $ PC32        <dbl> 0.88765838, 1.22654688, 0.26190835, -0.52613561, -0.585281…
## $ PC33        <dbl> -0.1815947, 0.7408401, -0.3699431, -0.6278284, -0.1973728,…
## $ PC34        <dbl> -0.56753914, -0.97664330, 0.39708608, 0.64592060, 0.487949…
## $ PC35        <dbl> -0.44159542, -0.06747213, -0.18401219, -0.18403338, -0.218…
## $ PC36        <dbl> 0.6208934, 1.1059554, 0.9439245, -0.5685238, -0.5764942, -…
## $ PC37        <dbl> -0.69957345, 1.10906503, -0.87946335, -0.14834323, 0.53997…
## $ PC38        <dbl> 1.12057267, 1.28476564, 0.44252444, 0.35699043, 0.47804099…
## $ PC39        <dbl> -0.7479259, -0.9681437, -0.3693314, -0.2206632, 0.2427862,…
## $ PC40        <dbl> 0.5229712, -0.1199517, -0.4835952, -0.4741853, -0.4285300,…
## $ PC41        <dbl> 1.14149130, 0.21855227, 0.41318937, 0.12373287, -0.3130451…
## $ PC42        <dbl> 0.57533658, 0.30778611, -0.54621204, -0.78401124, -1.26178…
## $ PC43        <dbl> -1.28635336, 0.52669680, -0.32759978, -0.71496318, -0.3791…
## $ PC44        <dbl> 0.2561775, 0.6729908, -0.1738093, -0.3570633, 0.1212171, -…
## $ PC45        <dbl> 0.85225508, -0.02795243, -0.23193266, 0.14144910, 0.196743…
## $ PC46        <dbl> -1.11237020, -0.38125206, -0.26954753, -0.32090246, 0.2303…
## $ PC47        <dbl> -0.00902313, 0.04741104, -0.29706252, 0.31522767, 0.788628…
## $ PC48        <dbl> -1.47584035, -0.71158883, -1.45934411, -1.07873424, -0.830…
## $ PC49        <dbl> 0.40984400, -1.33178062, -0.09605598, 0.30938402, 0.372201…
## $ PC50        <dbl> -0.57390759, -0.08459423, -0.94028183, 0.59784893, 0.20823…
## $ PC51        <dbl> 1.063442470, -0.405748078, 2.078425158, -1.025744948, -0.9…
## $ PC52        <dbl> -0.044439906, 0.114998513, -0.464920093, 0.322877717, 0.26…
## $ PC53        <dbl> 0.87313900, -0.11243619, -0.66238661, -0.54958418, -0.9967…
## $ PC54        <dbl> -0.442710729, 0.046309654, -0.102743134, 0.356350472, 0.62…
## $ PC55        <dbl> 0.14954802, -0.25820110, 0.82154642, 0.58510570, -1.104198…
## $ PC56        <dbl> -1.06952013, 0.03078531, -1.43386699, 0.21098818, 0.920213…
## $ PC57        <dbl> 0.29288489, 0.26826192, -0.63992818, 1.00094670, -0.979938…
## $ PC58        <dbl> 1.364762383, 0.025844325, 0.511943078, -1.008457091, -0.50…
## $ PC59        <dbl> 0.397231878, 0.299711257, -0.461322121, -0.212515415, -0.5…
## $ PC60        <dbl> -0.839812887, 0.176180642, -0.274033209, -0.837794120, -1.…
## $ PC61        <dbl> -0.433471168, 0.150933565, -0.952233833, 0.053588886, -0.2…
## $ PC62        <dbl> -0.12760793, 0.18479726, 0.20301537, 0.22455532, 1.0908803…

#Specify model

xgboost_spec <- 
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec) 

##tune hyperparameters

set.seed(65743)
xgboost_tune <-
  tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 2,
    control = control_grid(save_pred = TRUE)
  )
## → A | warning: !  The following columns have zero variance so scaling cannot be used:
##                  market_segment_Undefined, distribution_channel_Undefined,
##                  reserved_room_type_L, reserved_room_type_P, and deposit_type_Refundable.
##                ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##                  before normalizing.
## There were issues with some computations   A: x1                                                 → B | warning: !  The following columns have zero variance so scaling cannot be used:
##                  market_segment_Undefined, distribution_channel_GDS,
##                  distribution_channel_Undefined, reserved_room_type_L, and
##                  reserved_room_type_P.
##                ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##                  before normalizing.
## There were issues with some computations   A: x1There were issues with some computations   A: x1   B: x1                                                         → C | warning: !  The following columns have zero variance so scaling cannot be used:
##                  market_segment_Undefined, distribution_channel_Undefined,
##                  reserved_room_type_L, and reserved_room_type_P.
##                ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##                  before normalizing.
## There were issues with some computations   A: x1   B: x1There were issues with some computations   A: x1   B: x1   C: x1There were issues with some computations   A: x1   B: x1   C: x1

#Model evaluation

collect_metrics(xgboost_tune)
## # A tibble: 6 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric    
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>      
## 1     1    40          1      0.316  31.6                  1   accuracy   
## 2     1    40          1      0.316  31.6                  1   brier_class
## 3     1    40          1      0.316  31.6                  1   roc_auc    
## 4  2000     2         15      0.001   0.0000000001         0.5 accuracy   
## 5  2000     2         15      0.001   0.0000000001         0.5 brier_class
## 6  2000     2         15      0.001   0.0000000001         0.5 roc_auc    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(is_canceled, .pred_Canceled) %>%
    autoplot()

#Fit model for last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)
## → A | warning: !  The following columns have zero variance so scaling cannot be used:
##                  market_segment_Undefined, distribution_channel_Undefined,
##                  reserved_room_type_L, and reserved_room_type_P.
##                ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
##                  before normalizing.
## There were issues with some computations   A: x1There were issues with some computations   A: x1
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config        
##   <chr>       <chr>          <dbl> <chr>          
## 1 accuracy    binary        0.908  pre0_mod0_post0
## 2 roc_auc     binary        0.929  pre0_mod0_post0
## 3 brier_class binary        0.0917 pre0_mod0_post0
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(is_canceled, .pred_class) %>%
    autoplot()

#variable importance

xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()