hotels <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-02-11/hotels.csv')

## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

skimr::skim(hotels)

Data summary
Name	hotels
Number of rows	119390
Number of columns	32
_______________________
Column type frequency:
character	13
Date	1
numeric	18
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
hotel	1	10	12	2
arrival_date_month	1	3	9	12
meal	1	2	9	5
country	1	2	4	178
market_segment	1	6	13	8
distribution_channel	1	3	9	5
reserved_room_type	1	1	1	10
assigned_room_type	1	1	1	12
deposit_type	1	10	10	3
agent	1	1	4	334
company	1	1	4	353
customer_type	1	5	15	4
reservation_status	1	7	9	3

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
reservation_status_date	0	1	2014-10-17	2017-09-14	2016-08-07	926

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
is_canceled	0	1	0.37	0.48	0.00	0.00	0.00	1	1	▇▁▁▁▅
lead_time	0	1	104.01	106.86	0.00	18.00	69.00	160	737	▇▂▁▁▁
arrival_date_year	0	1	2016.16	0.71	2015.00	2016.00	2016.00	2017	2017	▃▁▇▁▆
arrival_date_week_number	0	1	27.17	13.61	1.00	16.00	28.00	38	53	▅▇▇▇▅
arrival_date_day_of_month	0	1	15.80	8.78	1.00	8.00	16.00	23	31	▇▇▇▇▆
stays_in_weekend_nights	0	1	0.93	1.00	0.00	0.00	1.00	2	19	▇▁▁▁▁
stays_in_week_nights	0	1	2.50	1.91	0.00	1.00	2.00	3	50	▇▁▁▁▁
adults	0	1	1.86	0.58	0.00	2.00	2.00	2	55	▇▁▁▁▁
children	4	1	0.10	0.40	0.00	0.00	0.00	0	10	▇▁▁▁▁
babies	0	1	0.01	0.10	0.00	0.00	0.00	0	10	▇▁▁▁▁
is_repeated_guest	0	1	0.03	0.18	0.00	0.00	0.00	0	1	▇▁▁▁▁
previous_cancellations	0	1	0.09	0.84	0.00	0.00	0.00	0	26	▇▁▁▁▁
previous_bookings_not_canceled	0	1	0.14	1.50	0.00	0.00	0.00	0	72	▇▁▁▁▁
booking_changes	0	1	0.22	0.65	0.00	0.00	0.00	0	21	▇▁▁▁▁
days_in_waiting_list	0	1	2.32	17.59	0.00	0.00	0.00	0	391	▇▁▁▁▁
adr	0	1	101.83	50.54	-6.38	69.29	94.58	126	5400	▇▁▁▁▁
required_car_parking_spaces	0	1	0.06	0.25	0.00	0.00	0.00	0	8	▇▁▁▁▁
total_of_special_requests	0	1	0.57	0.79	0.00	0.00	0.00	1	5	▇▁▁▁▁

# Create a vector of column names to convert to factors
factors_vec <- hotels %>% 
  select(hotel, arrival_date_month, meal, country, market_segment,
         distribution_channel, reserved_room_type, deposit_type,
         customer_type, reservation_status, is_repeated_guest,
         is_canceled) %>% 
  names()

# Clean the data
data_clean <- hotels %>%
  
  # Address factors imported as character/numeric
  mutate(across(all_of(factors_vec), as.factor)) %>%
  
  # Drop only the specific problem variables
  select(-company, -agent, -reservation_status, -country, -assigned_room_type) %>%
  
  # Remove missing values
  drop_na()

data_clean %>% count(is_canceled)

## # A tibble: 2 × 2
##   is_canceled     n
##   <fct>       <int>
## 1 0           75166
## 2 1           44220

data_clean %>% 
  ggplot(aes(is_canceled)) + 
  geom_bar()

Cancellation vs lead time

data_clean %>% 
  ggplot(aes(is_canceled, lead_time)) + 
  geom_boxplot()

Correlation Plot

# Step 1: Binarize the data
data_binarized <- data_clean %>%
  select(-reservation_status_date) %>%
  binarize()

# Step 2: Correlation analysis against the target variable
data_correlation <- data_binarized %>%
  correlate(target = is_canceled__1)

data_correlation

## # A tibble: 99 × 3
##    feature                   bin        correlation
##    <fct>                     <chr>            <dbl>
##  1 is_canceled               0               -1    
##  2 is_canceled               1                1    
##  3 deposit_type              Non_Refund       0.481
##  4 deposit_type              No_Deposit      -0.478
##  5 previous_cancellations    1                0.275
##  6 previous_cancellations    0               -0.271
##  7 lead_time                 -Inf_18         -0.270
##  8 total_of_special_requests 0                0.265
##  9 market_segment            Groups           0.222
## 10 lead_time                 160_Inf          0.220
## # ℹ 89 more rows

# Step 3: Plot
data_correlation %>%
  correlationfunnel::plot_correlation_funnel()

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: ggrepel: 55 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

predict cancellation

set.seed(123)

hotel_split <- initial_split(data_clean, prop = 0.75, strata = is_canceled)

hotel_train <- training(hotel_split)
hotel_test  <- testing(hotel_split)

hotel_recipe <- recipe(is_canceled ~ ., data = hotel_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

log_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

hotel_workflow <- workflow() %>%
  add_recipe(hotel_recipe) %>%
  add_model(log_model)

hotel_fit <- hotel_workflow %>%
  fit(data = hotel_train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

hotel_predictions <- predict(hotel_fit, hotel_test, type = "prob") %>%
  bind_cols(predict(hotel_fit, hotel_test)) %>%
  bind_cols(hotel_test)

hotel_predictions %>%
  roc_auc(truth = is_canceled, .pred_1)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary     0.0000401

hotel_predictions %>%
  accuracy(truth = is_canceled, .pred_class)

## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.998

Summary

This assignment looks at hotel booking data to figure out why customers cancel their reservations and how we can predict it. First, the data is cleaned up and explored using summaries and graphs to spot patterns. Then, a correlation funnel is used to see which factors are most related to cancellations. Finally, a logistic regression model is built to predict whether a booking will be canceled, and its performance is checked using accuracy and ROC AUC.

Apply 5

Matthew Plutzner

2026-04-12

Correlation Plot

Summary