hotels <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv')
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

skimr::skim(hotels)
Data summary
Name hotels
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
character 13
Date 1
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hotel 0 1 10 12 0 2 0
arrival_date_month 0 1 3 9 0 12 0
meal 0 1 2 9 0 5 0
country 0 1 2 4 0 178 0
market_segment 0 1 6 13 0 8 0
distribution_channel 0 1 3 9 0 5 0
reserved_room_type 0 1 1 1 0 10 0
assigned_room_type 0 1 1 1 0 12 0
deposit_type 0 1 10 10 0 3 0
agent 0 1 1 4 0 334 0
company 0 1 1 4 0 353 0
customer_type 0 1 5 15 0 4 0
reservation_status 0 1 7 9 0 3 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1 ▇▁▁▁▅
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737 ▇▂▁▁▁
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017 ▃▁▇▁▆
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53 ▅▇▇▇▅
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31 ▇▇▇▇▆
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19 ▇▁▁▁▁
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50 ▇▁▁▁▁
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55 ▇▁▁▁▁
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10 ▇▁▁▁▁
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10 ▇▁▁▁▁
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1 ▇▁▁▁▁
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72 ▇▁▁▁▁
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21 ▇▁▁▁▁
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391 ▇▁▁▁▁
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400 ▇▁▁▁▁
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8 ▇▁▁▁▁
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5 ▇▁▁▁▁
# Create a vector of column names to convert to factors
factors_vec <- hotels %>% 
  select(hotel, arrival_date_month, meal, country, market_segment,
         distribution_channel, reserved_room_type, deposit_type,
         customer_type, reservation_status, is_repeated_guest,
         is_canceled) %>% 
  names()

# Clean the data
data_clean <- hotels %>%
  
  # Address factors imported as character/numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # Drop only the specific problem variables
  select(-company, -agent, -reservation_status, -country, -assigned_room_type) %>%
  
  # Remove missing values
  drop_na()
data_clean %>% count(is_canceled)
## # A tibble: 2 × 2
##   is_canceled     n
##   <fct>       <int>
## 1 0           75166
## 2 1           44220
data_clean %>% 
  ggplot(aes(is_canceled)) + 
  geom_bar()

Cancellation vs lead time

data_clean %>% 
  ggplot(aes(is_canceled, lead_time)) + 
  geom_boxplot()

Correlation Plot

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  select(-reservation_status_date) %>%
  binarize()

# Step 2: Correlation analysis against the target variable
data_correlation <- data_binarized %>%
  correlate(target = is_canceled__1)

data_correlation
## # A tibble: 99 × 3
##    feature                   bin        correlation
##    <fct>                     <chr>            <dbl>
##  1 is_canceled               0               -1    
##  2 is_canceled               1                1    
##  3 deposit_type              Non_Refund       0.481
##  4 deposit_type              No_Deposit      -0.478
##  5 previous_cancellations    1                0.275
##  6 previous_cancellations    0               -0.271
##  7 lead_time                 -Inf_18         -0.270
##  8 total_of_special_requests 0                0.265
##  9 market_segment            Groups           0.222
## 10 lead_time                 160_Inf          0.220
## # ℹ 89 more rows
# Step 3: Plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 55 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

predict cancellation

set.seed(123)

hotel_split <- initial_split(data_clean, prop = 0.75, strata = is_canceled)

hotel_train <- training(hotel_split)
hotel_test  <- testing(hotel_split)
hotel_recipe <- recipe(is_canceled ~ ., data = hotel_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())
log_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")
hotel_workflow <- workflow() %>%
  add_recipe(hotel_recipe) %>%
  add_model(log_model)
hotel_fit <- hotel_workflow %>%
  fit(data = hotel_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
hotel_predictions <- predict(hotel_fit, hotel_test, type = "prob") %>%
  bind_cols(predict(hotel_fit, hotel_test)) %>%
  bind_cols(hotel_test)
hotel_predictions %>%
  roc_auc(truth = is_canceled, .pred_1)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary     0.0000401
hotel_predictions %>%
  accuracy(truth = is_canceled, .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.998

Summary

This assignment looks at hotel booking data to figure out why customers cancel their reservations and how we can predict it. First, the data is cleaned up and explored using summaries and graphs to spot patterns. Then, a correlation funnel is used to see which factors are most related to cancellations. Finally, a logistic regression model is built to predict whether a booking will be canceled, and its performance is checked using accuracy and ROC AUC.