data <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date (1): reservation_status_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##Explore data
skimr::skim(data)
| Name | data |
| Number of rows | 119390 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 13 |
| Date | 1 |
| numeric | 18 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| hotel | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
| arrival_date_month | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
| meal | 0 | 1 | 2 | 9 | 0 | 5 | 0 |
| country | 0 | 1 | 2 | 4 | 0 | 178 | 0 |
| market_segment | 0 | 1 | 6 | 13 | 0 | 8 | 0 |
| distribution_channel | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
| reserved_room_type | 0 | 1 | 1 | 1 | 0 | 10 | 0 |
| assigned_room_type | 0 | 1 | 1 | 1 | 0 | 12 | 0 |
| deposit_type | 0 | 1 | 10 | 10 | 0 | 3 | 0 |
| agent | 0 | 1 | 1 | 4 | 0 | 334 | 0 |
| company | 0 | 1 | 1 | 4 | 0 | 353 | 0 |
| customer_type | 0 | 1 | 5 | 15 | 0 | 4 | 0 |
| reservation_status | 0 | 1 | 7 | 9 | 0 | 3 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| reservation_status_date | 0 | 1 | 2014-10-17 | 2017-09-14 | 2016-08-07 | 926 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| is_canceled | 0 | 1 | 0.37 | 0.48 | 0.00 | 0.00 | 0.00 | 1 | 1 | ▇▁▁▁▅ |
| lead_time | 0 | 1 | 104.01 | 106.86 | 0.00 | 18.00 | 69.00 | 160 | 737 | ▇▂▁▁▁ |
| arrival_date_year | 0 | 1 | 2016.16 | 0.71 | 2015.00 | 2016.00 | 2016.00 | 2017 | 2017 | ▃▁▇▁▆ |
| arrival_date_week_number | 0 | 1 | 27.17 | 13.61 | 1.00 | 16.00 | 28.00 | 38 | 53 | ▅▇▇▇▅ |
| arrival_date_day_of_month | 0 | 1 | 15.80 | 8.78 | 1.00 | 8.00 | 16.00 | 23 | 31 | ▇▇▇▇▆ |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 1.00 | 0.00 | 0.00 | 1.00 | 2 | 19 | ▇▁▁▁▁ |
| stays_in_week_nights | 0 | 1 | 2.50 | 1.91 | 0.00 | 1.00 | 2.00 | 3 | 50 | ▇▁▁▁▁ |
| adults | 0 | 1 | 1.86 | 0.58 | 0.00 | 2.00 | 2.00 | 2 | 55 | ▇▁▁▁▁ |
| children | 4 | 1 | 0.10 | 0.40 | 0.00 | 0.00 | 0.00 | 0 | 10 | ▇▁▁▁▁ |
| babies | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0 | 10 | ▇▁▁▁▁ |
| is_repeated_guest | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.00 | 0.00 | 0 | 1 | ▇▁▁▁▁ |
| previous_cancellations | 0 | 1 | 0.09 | 0.84 | 0.00 | 0.00 | 0.00 | 0 | 26 | ▇▁▁▁▁ |
| previous_bookings_not_canceled | 0 | 1 | 0.14 | 1.50 | 0.00 | 0.00 | 0.00 | 0 | 72 | ▇▁▁▁▁ |
| booking_changes | 0 | 1 | 0.22 | 0.65 | 0.00 | 0.00 | 0.00 | 0 | 21 | ▇▁▁▁▁ |
| days_in_waiting_list | 0 | 1 | 2.32 | 17.59 | 0.00 | 0.00 | 0.00 | 0 | 391 | ▇▁▁▁▁ |
| adr | 0 | 1 | 101.83 | 50.54 | -6.38 | 69.29 | 94.58 | 126 | 5400 | ▇▁▁▁▁ |
| required_car_parking_spaces | 0 | 1 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0 | 8 | ▇▁▁▁▁ |
| total_of_special_requests | 0 | 1 | 0.57 | 0.79 | 0.00 | 0.00 | 0.00 | 1 | 5 | ▇▁▁▁▁ |
# Create a vector of column names to convert to factors
factors_vec <- data %>%
select(hotel, arrival_date_month, meal, market_segment,
distribution_channel, reserved_room_type, deposit_type,
customer_type, is_repeated_guest, is_canceled) %>%
names()
# Clean the data
data_clean <- data %>%
# Address factors imported as character/numeric
mutate(across(all_of(factors_vec), as.factor)) %>%
# Drop only the specific problem variables
select(-company, -agent, -reservation_status, -country, -assigned_room_type) %>%
# Rename target levels to match code along style
mutate(is_canceled = if_else(is_canceled == 1, "Canceled", "Not_Canceled"),
is_canceled = as.factor(is_canceled))
#explore data
# Explore data
data_clean %>% count(is_canceled)
## # A tibble: 2 × 2
## is_canceled n
## <fct> <int>
## 1 Canceled 44224
## 2 Not_Canceled 75166
data_clean %>%
ggplot(aes(is_canceled)) +
geom_bar()
cancellation vs lead time
# Visualize cancellations across lead time
data_clean %>%
ggplot(aes(is_canceled, lead_time)) +
geom_boxplot()
correlation plot
# Step 1: Binarize the data
data_binarized <- data_clean %>%
drop_na() %>%
select(-reservation_status_date) %>%
binarize()
data_binarized %>% glimpse()
## Rows: 119,386
## Columns: 99
## $ hotel__City_Hotel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hotel__Resort_Hotel <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_canceled__Canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ is_canceled__Not_Canceled <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ `lead_time__-Inf_18` <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,…
## $ lead_time__18_69 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ lead_time__69_160 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ lead_time__160_Inf <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2015 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_year__2016 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_year__2017 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__April <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__August <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__December <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__February <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__January <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__July <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_month__June <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__March <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__May <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__November <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__October <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_month__September <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_week_number__-Inf_16` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__16_28 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_week_number__28_38 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_week_number__38_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `arrival_date_day_of_month__-Inf_8` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ arrival_date_day_of_month__8_16 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__16_23 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ arrival_date_day_of_month__23_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_weekend_nights__-Inf_1` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ stays_in_weekend_nights__1_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ stays_in_weekend_nights__2_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `stays_in_week_nights__-Inf_1` <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,…
## $ stays_in_week_nights__1_2 <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,…
## $ stays_in_week_nights__2_3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ stays_in_week_nights__3_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adults__-Inf_2` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ adults__2_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ children__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ children__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `children__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ babies__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `babies__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ meal__BB <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,…
## $ meal__HB <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ meal__SC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `meal__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ market_segment__Corporate <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ market_segment__Direct <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ market_segment__Groups <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `market_segment__Offline_TA/TO` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ market_segment__Online_TA <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,…
## $ `market_segment__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Corporate <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ distribution_channel__Direct <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,…
## $ `distribution_channel__TA/TO` <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,…
## $ `distribution_channel__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ is_repeated_guest__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ is_repeated_guest__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_cancellations__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_cancellations__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_cancellations__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ previous_bookings_not_canceled__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ previous_bookings_not_canceled__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `previous_bookings_not_canceled__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__A <dbl> 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ reserved_room_type__D <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ reserved_room_type__E <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__F <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ reserved_room_type__G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `reserved_room_type__-OTHER` <dbl> 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ booking_changes__0 <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ booking_changes__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ booking_changes__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `booking_changes__-OTHER` <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ deposit_type__No_Deposit <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ deposit_type__Non_Refund <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `deposit_type__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ days_in_waiting_list__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `days_in_waiting_list__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Contract <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ customer_type__Transient <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `customer_type__Transient-Party` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `customer_type__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `adr__-Inf_69.29` <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ adr__69.29_94.59 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,…
## $ adr__94.59_126 <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ adr__126_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ required_car_parking_spaces__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ required_car_parking_spaces__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `required_car_parking_spaces__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__0 <dbl> 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,…
## $ total_of_special_requests__1 <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,…
## $ total_of_special_requests__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ total_of_special_requests__3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `total_of_special_requests__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Step 2: Correlation analysis against the target variable
data_correlation <- data_binarized %>%
correlate(target = is_canceled__Canceled)
data_correlation
## # A tibble: 99 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 is_canceled Canceled 1
## 2 is_canceled Not_Canceled -1
## 3 deposit_type Non_Refund 0.481
## 4 deposit_type No_Deposit -0.478
## 5 previous_cancellations 1 0.275
## 6 previous_cancellations 0 -0.271
## 7 lead_time -Inf_18 -0.270
## 8 total_of_special_requests 0 0.265
## 9 market_segment Groups 0.222
## 10 lead_time 160_Inf 0.220
## # ℹ 89 more rows
#step 3:plot
data_correlation %>%
correlationfunnel::plot_correlation_funnel()
#split data
set.seed(1234)
data_clean <- data_clean %>%
drop_na() %>%
sample_n(3000)
data_split <- initial_split(data_clean, strata = is_canceled)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, v = 3, strata = is_canceled)
data_cv
## # 3-fold cross-validation using stratification
## # A tibble: 3 × 2
## splits id
## <list> <chr>
## 1 <split [1500/750]> Fold1
## 2 <split [1500/750]> Fold2
## 3 <split [1500/750]> Fold3
#Preprocess data
xgboost_rec <- recipes::recipe(is_canceled ~ ., data = data_train) %>%
step_date(reservation_status_date, features = c("year", "month", "dow")) %>%
step_rm(reservation_status_date) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_pca(all_numeric_predictors(), threshold = 0.99)
xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Warning: ! The following columns have zero variance so scaling cannot be used:
## market_segment_Undefined, distribution_channel_Undefined,
## reserved_room_type_L, and reserved_room_type_P.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## Rows: 2,250
## Columns: 63
## $ is_canceled <fct> Canceled, Canceled, Canceled, Canceled, Canceled, Canceled…
## $ PC01 <dbl> -1.9194210, -0.1254247, -0.9212166, -0.4488066, 0.2143022,…
## $ PC02 <dbl> 0.9688114, 0.5445966, 0.9407378, 0.9669918, 0.4601126, 1.1…
## $ PC03 <dbl> 0.3269090, -0.7530889, 2.1848271, 1.9602386, 1.1607782, -0…
## $ PC04 <dbl> 0.18919929, 4.49291445, 1.24741204, 0.44264502, 0.14534746…
## $ PC05 <dbl> 1.35817302, -0.50659173, 3.23475188, 1.41732053, -0.036467…
## $ PC06 <dbl> -1.7586495, 0.3036193, -0.6590210, -0.3890884, -0.9655685,…
## $ PC07 <dbl> 0.588546204, 1.729328349, 1.059147914, -0.652387064, -0.73…
## $ PC08 <dbl> -0.50991922, -0.54620735, -0.03949499, -0.87363653, -0.541…
## $ PC09 <dbl> 0.138727827, -0.838240169, -1.641032450, 1.452945293, 0.41…
## $ PC10 <dbl> -0.626528565, -1.433083626, -1.042241809, 0.188199201, 0.1…
## $ PC11 <dbl> 0.22436435, -0.42187141, 0.61954116, 0.40390192, 0.0609452…
## $ PC12 <dbl> 0.23660241, 1.81889242, 0.40090135, -0.74869201, -1.034950…
## $ PC13 <dbl> -2.1542008, -3.1451840, -2.0185194, -1.4535937, -1.1238439…
## $ PC14 <dbl> 0.215312920, -1.066899788, 0.826057379, -0.008613257, -0.6…
## $ PC15 <dbl> 0.52260468, -0.46866366, 0.06573072, 0.99447298, 2.0879059…
## $ PC16 <dbl> 1.64690227, -0.39661009, -0.06908672, 0.26111862, 0.220708…
## $ PC17 <dbl> -0.47699166, 1.21887738, 0.49908099, -1.27156121, -1.70089…
## $ PC18 <dbl> -1.71296288, -0.94585970, -0.43475650, -1.16716655, -0.352…
## $ PC19 <dbl> -0.31933885, -0.57767696, -0.86190985, -0.28531635, 1.4500…
## $ PC20 <dbl> 0.1692440, -0.1311913, 1.3383605, -0.5122346, 0.0430493, -…
## $ PC21 <dbl> 0.02515021, -0.10264720, 0.02089192, -0.34821817, 1.356207…
## $ PC22 <dbl> -0.507859548, -0.775090995, -0.107947455, 1.092585346, 0.0…
## $ PC23 <dbl> 1.66383728, -1.23346984, -1.15733214, 1.68355189, 0.516724…
## $ PC24 <dbl> 0.58329078, -0.61702671, -1.02295314, -0.40908951, 0.92100…
## $ PC25 <dbl> 1.33837307, -0.67582530, -0.06402532, -0.93783611, -0.0913…
## $ PC26 <dbl> 0.12464590, 1.73619903, 0.38191045, -0.86893066, -0.856271…
## $ PC27 <dbl> 0.07590962, 0.71005872, 0.15462369, -0.18661648, 0.7581155…
## $ PC28 <dbl> 0.8855875, -0.6796415, -1.0571121, -1.0414294, -0.3412472,…
## $ PC29 <dbl> 0.86067662, 0.11202209, -0.36911862, 0.38797016, -1.212295…
## $ PC30 <dbl> -0.43411889, 0.72141762, -0.03675400, -0.24687637, 0.64020…
## $ PC31 <dbl> -0.27227229, -0.05009608, 0.27503320, 0.64195938, -0.09489…
## $ PC32 <dbl> 0.88765838, 1.22654688, 0.26190835, -0.52613561, -0.585281…
## $ PC33 <dbl> -0.1815947, 0.7408401, -0.3699431, -0.6278284, -0.1973728,…
## $ PC34 <dbl> -0.56753914, -0.97664330, 0.39708608, 0.64592060, 0.487949…
## $ PC35 <dbl> -0.44159542, -0.06747213, -0.18401219, -0.18403338, -0.218…
## $ PC36 <dbl> 0.6208934, 1.1059554, 0.9439245, -0.5685238, -0.5764942, -…
## $ PC37 <dbl> -0.69957345, 1.10906503, -0.87946335, -0.14834323, 0.53997…
## $ PC38 <dbl> 1.12057267, 1.28476564, 0.44252444, 0.35699043, 0.47804099…
## $ PC39 <dbl> -0.7479259, -0.9681437, -0.3693314, -0.2206632, 0.2427862,…
## $ PC40 <dbl> 0.5229712, -0.1199517, -0.4835952, -0.4741853, -0.4285300,…
## $ PC41 <dbl> 1.14149130, 0.21855227, 0.41318937, 0.12373287, -0.3130451…
## $ PC42 <dbl> 0.57533658, 0.30778611, -0.54621204, -0.78401124, -1.26178…
## $ PC43 <dbl> -1.28635336, 0.52669680, -0.32759978, -0.71496318, -0.3791…
## $ PC44 <dbl> 0.2561775, 0.6729908, -0.1738093, -0.3570633, 0.1212171, -…
## $ PC45 <dbl> 0.85225508, -0.02795243, -0.23193266, 0.14144910, 0.196743…
## $ PC46 <dbl> -1.11237020, -0.38125206, -0.26954753, -0.32090246, 0.2303…
## $ PC47 <dbl> -0.00902313, 0.04741104, -0.29706252, 0.31522767, 0.788628…
## $ PC48 <dbl> -1.47584035, -0.71158883, -1.45934411, -1.07873424, -0.830…
## $ PC49 <dbl> 0.40984400, -1.33178062, -0.09605598, 0.30938402, 0.372201…
## $ PC50 <dbl> -0.57390759, -0.08459423, -0.94028183, 0.59784893, 0.20823…
## $ PC51 <dbl> 1.063442470, -0.405748078, 2.078425158, -1.025744948, -0.9…
## $ PC52 <dbl> -0.044439906, 0.114998513, -0.464920093, 0.322877717, 0.26…
## $ PC53 <dbl> 0.87313900, -0.11243619, -0.66238661, -0.54958418, -0.9967…
## $ PC54 <dbl> -0.442710729, 0.046309654, -0.102743134, 0.356350472, 0.62…
## $ PC55 <dbl> 0.14954802, -0.25820110, 0.82154642, 0.58510570, -1.104198…
## $ PC56 <dbl> -1.06952013, 0.03078531, -1.43386699, 0.21098818, 0.920213…
## $ PC57 <dbl> 0.29288489, 0.26826192, -0.63992818, 1.00094670, -0.979938…
## $ PC58 <dbl> 1.364762383, 0.025844325, 0.511943078, -1.008457091, -0.50…
## $ PC59 <dbl> 0.397231878, 0.299711257, -0.461322121, -0.212515415, -0.5…
## $ PC60 <dbl> -0.839812887, 0.176180642, -0.274033209, -0.837794120, -1.…
## $ PC61 <dbl> -0.433471168, 0.150933565, -0.952233833, 0.053588886, -0.2…
## $ PC62 <dbl> -0.12760793, 0.18479726, 0.20301537, 0.22455532, 1.0908803…
#Specify model
xgboost_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune(),
sample_size = tune()
) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_rec) %>%
add_model(xgboost_spec)
##tune hyperparameters
set.seed(65743)
xgboost_tune <-
tune_grid(
xgboost_workflow,
resamples = data_cv,
grid = 2,
control = control_grid(save_pred = TRUE)
)
## → A | warning: ! The following columns have zero variance so scaling cannot be used:
## market_segment_Undefined, distribution_channel_Undefined,
## reserved_room_type_L, reserved_room_type_P, and deposit_type_Refundable.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## There were issues with some computations A: x1 → B | warning: ! The following columns have zero variance so scaling cannot be used:
## market_segment_Undefined, distribution_channel_GDS,
## distribution_channel_Undefined, reserved_room_type_L, and
## reserved_room_type_P.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## There were issues with some computations A: x1There were issues with some computations A: x1 B: x1 → C | warning: ! The following columns have zero variance so scaling cannot be used:
## market_segment_Undefined, distribution_channel_Undefined,
## reserved_room_type_L, and reserved_room_type_P.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## There were issues with some computations A: x1 B: x1There were issues with some computations A: x1 B: x1 C: x1There were issues with some computations A: x1 B: x1 C: x1
#Model evaluation
collect_metrics(xgboost_tune)
## # A tibble: 6 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 1 40 1 0.316 31.6 1 accuracy
## 2 1 40 1 0.316 31.6 1 brier_class
## 3 1 40 1 0.316 31.6 1 roc_auc
## 4 2000 2 15 0.001 0.0000000001 0.5 accuracy
## 5 2000 2 15 0.001 0.0000000001 0.5 brier_class
## 6 2000 2 15 0.001 0.0000000001 0.5 roc_auc
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(is_canceled, .pred_Canceled) %>%
autoplot()
#Fit model for last time
xgboost_last <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
last_fit(data_split)
## → A | warning: ! The following columns have zero variance so scaling cannot be used:
## market_segment_Undefined, distribution_channel_Undefined,
## reserved_room_type_L, and reserved_room_type_P.
## ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
## before normalizing.
## There were issues with some computations A: x1There were issues with some computations A: x1
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.908 pre0_mod0_post0
## 2 roc_auc binary 0.929 pre0_mod0_post0
## 3 brier_class binary 0.0917 pre0_mod0_post0
collect_predictions(xgboost_last) %>%
yardstick::conf_mat(is_canceled, .pred_class) %>%
autoplot()
#variable importance
xgboost_last %>%
workflows::extract_fit_engine() %>%
vip()