Load required libraries

library(conflicted)
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
library(ggplot2)

Read data

hotel_bookings <- read.csv("challenge_datasets/hotel_bookings.csv")
as_tibble(hotel_bookings)
## # A tibble: 119,390 × 32
##    hotel        is_canceled lead_time arrival_date_year arrival_date_month
##    <chr>              <int>     <int>             <int> <chr>             
##  1 Resort Hotel           0       342              2015 July              
##  2 Resort Hotel           0       737              2015 July              
##  3 Resort Hotel           0         7              2015 July              
##  4 Resort Hotel           0        13              2015 July              
##  5 Resort Hotel           0        14              2015 July              
##  6 Resort Hotel           0        14              2015 July              
##  7 Resort Hotel           0         0              2015 July              
##  8 Resort Hotel           0         9              2015 July              
##  9 Resort Hotel           1        85              2015 July              
## 10 Resort Hotel           1        75              2015 July              
## # ℹ 119,380 more rows
## # ℹ 27 more variables: arrival_date_week_number <int>,
## #   arrival_date_day_of_month <int>, stays_in_weekend_nights <int>,
## #   stays_in_week_nights <int>, adults <int>, children <int>, babies <int>,
## #   meal <chr>, country <chr>, market_segment <chr>,
## #   distribution_channel <chr>, is_repeated_guest <int>,
## #   previous_cancellations <int>, previous_bookings_not_canceled <int>, …

Analyzing data

In this section, we will do an elementary analysis to understand the data.

# Visualize the first few rows of the data
head(hotel_bookings)
##          hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel           0       342              2015               July
## 2 Resort Hotel           0       737              2015               July
## 3 Resort Hotel           0         7              2015               July
## 4 Resort Hotel           0        13              2015               July
## 5 Resort Hotel           0        14              2015               July
## 6 Resort Hotel           0        14              2015               July
##   arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1                       27                         1                       0
## 2                       27                         1                       0
## 3                       27                         1                       0
## 4                       27                         1                       0
## 5                       27                         1                       0
## 6                       27                         1                       0
##   stays_in_week_nights adults children babies meal country market_segment
## 1                    0      2        0      0   BB     PRT         Direct
## 2                    0      2        0      0   BB     PRT         Direct
## 3                    1      1        0      0   BB     GBR         Direct
## 4                    1      1        0      0   BB     GBR      Corporate
## 5                    2      2        0      0   BB     GBR      Online TA
## 6                    2      2        0      0   BB     GBR      Online TA
##   distribution_channel is_repeated_guest previous_cancellations
## 1               Direct                 0                      0
## 2               Direct                 0                      0
## 3               Direct                 0                      0
## 4            Corporate                 0                      0
## 5                TA/TO                 0                      0
## 6                TA/TO                 0                      0
##   previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1                              0                  C                  C
## 2                              0                  C                  C
## 3                              0                  A                  C
## 4                              0                  A                  A
## 5                              0                  A                  A
## 6                              0                  A                  A
##   booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1               3   No Deposit  NULL    NULL                    0     Transient
## 2               4   No Deposit  NULL    NULL                    0     Transient
## 3               0   No Deposit  NULL    NULL                    0     Transient
## 4               0   No Deposit   304    NULL                    0     Transient
## 5               0   No Deposit   240    NULL                    0     Transient
## 6               0   No Deposit   240    NULL                    0     Transient
##   adr required_car_parking_spaces total_of_special_requests reservation_status
## 1   0                           0                         0          Check-Out
## 2   0                           0                         0          Check-Out
## 3  75                           0                         0          Check-Out
## 4  75                           0                         0          Check-Out
## 5  98                           0                         1          Check-Out
## 6  98                           0                         1          Check-Out
##   reservation_status_date
## 1              2015-07-01
## 2              2015-07-01
## 3              2015-07-02
## 4              2015-07-02
## 5              2015-07-03
## 6              2015-07-03

The summary table provides a lot of insights on the hotel bookings data. * is_cancelled: The data reveals that about 37% of the bookings are cancelled. * lead_time: The number of days between the booking date and arrival date. The average lead time is around 104 days. * arrival_date_year: Bookings span from 2015 to 2017. * stays_in_weekend_nights and stays_in_week_nights: Guests stay, on average, 1 day over the weekend and 2.5 days during the week. * adults: Most bookings are for two adults, likely indicating couples or pairs of travelers. * children and babies: There are very few bookings with children or babies. * is_repeated_guest: Only about 3.19% of the bookings were made by repeat guests. * previous_cancellations: On average, there’s a low number of previous cancellations per guest, with a mean around 0.087. * booking_changes: Changes to bookings are relatively infrequent, with an average of 0.221 changes per booking. * adr: The average daily rate for a room is about 94.58, indicating the average revenue per booked room.

# Get a summary of the dataset
summary(hotel_bookings)
##     hotel            is_canceled       lead_time   arrival_date_year
##  Length:119390      Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Class :character   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##  Mode  :character   Median :0.0000   Median : 69   Median :2016     
##                     Mean   :0.3704   Mean   :104   Mean   :2016     
##                     3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                     Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                     
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  Length:119390      Min.   : 1.00            Min.   : 1.0             
##  Class :character   1st Qu.:16.00            1st Qu.: 8.0             
##  Mode  :character   Median :28.00            Median :16.0             
##                     Mean   :27.17            Mean   :15.8             
##                     3rd Qu.:38.00            3rd Qu.:23.0             
##                     Max.   :53.00            Max.   :31.0             
##                                                                       
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies              meal             country         
##  Min.   : 0.0000   Min.   : 0.000000   Length:119390      Length:119390     
##  1st Qu.: 0.0000   1st Qu.: 0.000000   Class :character   Class :character  
##  Median : 0.0000   Median : 0.000000   Mode  :character   Mode  :character  
##  Mean   : 0.1039   Mean   : 0.007949                                        
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000                                        
##  Max.   :10.0000   Max.   :10.000000                                        
##  NA's   :4                                                                  
##  market_segment     distribution_channel is_repeated_guest
##  Length:119390      Length:119390        Min.   :0.00000  
##  Class :character   Class :character     1st Qu.:0.00000  
##  Mode  :character   Mode  :character     Median :0.00000  
##                                          Mean   :0.03191  
##                                          3rd Qu.:0.00000  
##                                          Max.   :1.00000  
##                                                           
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                Length:119390     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                Class :character  
##  Median : 0.00000       Median : 0.0000                Mode  :character  
##  Mean   : 0.08712       Mean   : 0.1371                                  
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                                  
##  Max.   :26.00000       Max.   :72.0000                                  
##                                                                          
##  assigned_room_type booking_changes   deposit_type          agent          
##  Length:119390      Min.   : 0.0000   Length:119390      Length:119390     
##  Class :character   1st Qu.: 0.0000   Class :character   Class :character  
##  Mode  :character   Median : 0.0000   Mode  :character   Mode  :character  
##                     Mean   : 0.2211                                        
##                     3rd Qu.: 0.0000                                        
##                     Max.   :21.0000                                        
##                                                                            
##    company          days_in_waiting_list customer_type           adr         
##  Length:119390      Min.   :  0.000      Length:119390      Min.   :  -6.38  
##  Class :character   1st Qu.:  0.000      Class :character   1st Qu.:  69.29  
##  Mode  :character   Median :  0.000      Mode  :character   Median :  94.58  
##                     Mean   :  2.321                         Mean   : 101.83  
##                     3rd Qu.:  0.000                         3rd Qu.: 126.00  
##                     Max.   :391.000                         Max.   :5400.00  
##                                                                              
##  required_car_parking_spaces total_of_special_requests reservation_status
##  Min.   :0.00000             Min.   :0.0000            Length:119390     
##  1st Qu.:0.00000             1st Qu.:0.0000            Class :character  
##  Median :0.00000             Median :0.0000            Mode  :character  
##  Mean   :0.06252             Mean   :0.5714                              
##  3rd Qu.:0.00000             3rd Qu.:1.0000                              
##  Max.   :8.00000             Max.   :5.0000                              
##                                                                          
##  reservation_status_date
##  Length:119390          
##  Class :character       
##  Mode  :character       
##                         
##                         
##                         
## 
# View the structure of the data
str(hotel_bookings)
## 'data.frame':    119390 obs. of  32 variables:
##  $ hotel                         : chr  "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
##  $ is_canceled                   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : int  342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : chr  "July" "July" "July" "July" ...
##  $ arrival_date_week_number      : int  27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : int  0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : int  2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : chr  "BB" "BB" "BB" "BB" ...
##  $ country                       : chr  "PRT" "PRT" "GBR" "GBR" ...
##  $ market_segment                : chr  "Direct" "Direct" "Direct" "Corporate" ...
##  $ distribution_channel          : chr  "Direct" "Direct" "Direct" "Corporate" ...
##  $ is_repeated_guest             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : chr  "C" "C" "A" "A" ...
##  $ assigned_room_type            : chr  "C" "C" "C" "A" ...
##  $ booking_changes               : int  3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : chr  "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
##  $ agent                         : chr  "NULL" "NULL" "NULL" "304" ...
##  $ company                       : chr  "NULL" "NULL" "NULL" "NULL" ...
##  $ days_in_waiting_list          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : chr  "Transient" "Transient" "Transient" "Transient" ...
##  $ adr                           : num  0 0 75 75 98 ...
##  $ required_car_parking_spaces   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : chr  "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
##  $ reservation_status_date       : chr  "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...

We get to understand the hotel types using the following command.

# Get unique hotel types, countries
print(unique(hotel_bookings$hotel))
## [1] "Resort Hotel" "City Hotel"
print(unique(hotel_bookings$country))
##   [1] "PRT"  "GBR"  "USA"  "ESP"  "IRL"  "FRA"  "NULL" "ROU"  "NOR"  "OMN" 
##  [11] "ARG"  "POL"  "DEU"  "BEL"  "CHE"  "CN"   "GRC"  "ITA"  "NLD"  "DNK" 
##  [21] "RUS"  "SWE"  "AUS"  "EST"  "CZE"  "BRA"  "FIN"  "MOZ"  "BWA"  "LUX" 
##  [31] "SVN"  "ALB"  "IND"  "CHN"  "MEX"  "MAR"  "UKR"  "SMR"  "LVA"  "PRI" 
##  [41] "SRB"  "CHL"  "AUT"  "BLR"  "LTU"  "TUR"  "ZAF"  "AGO"  "ISR"  "CYM" 
##  [51] "ZMB"  "CPV"  "ZWE"  "DZA"  "KOR"  "CRI"  "HUN"  "ARE"  "TUN"  "JAM" 
##  [61] "HRV"  "HKG"  "IRN"  "GEO"  "AND"  "GIB"  "URY"  "JEY"  "CAF"  "CYP" 
##  [71] "COL"  "GGY"  "KWT"  "NGA"  "MDV"  "VEN"  "SVK"  "FJI"  "KAZ"  "PAK" 
##  [81] "IDN"  "LBN"  "PHL"  "SEN"  "SYC"  "AZE"  "BHR"  "NZL"  "THA"  "DOM" 
##  [91] "MKD"  "MYS"  "ARM"  "JPN"  "LKA"  "CUB"  "CMR"  "BIH"  "MUS"  "COM" 
## [101] "SUR"  "UGA"  "BGR"  "CIV"  "JOR"  "SYR"  "SGP"  "BDI"  "SAU"  "VNM" 
## [111] "PLW"  "QAT"  "EGY"  "PER"  "MLT"  "MWI"  "ECU"  "MDG"  "ISL"  "UZB" 
## [121] "NPL"  "BHS"  "MAC"  "TGO"  "TWN"  "DJI"  "STP"  "KNA"  "ETH"  "IRQ" 
## [131] "HND"  "RWA"  "KHM"  "MCO"  "BGD"  "IMN"  "TJK"  "NIC"  "BEN"  "VGB" 
## [141] "TZA"  "GAB"  "GHA"  "TMP"  "GLP"  "KEN"  "LIE"  "GNB"  "MNE"  "UMI" 
## [151] "MYT"  "FRO"  "MMR"  "PAN"  "BFA"  "LBY"  "MLI"  "NAM"  "BOL"  "PRY" 
## [161] "BRB"  "ABW"  "AIA"  "SLV"  "DMA"  "PYF"  "GUY"  "LCA"  "ATA"  "GTM" 
## [171] "ASM"  "MRT"  "NCL"  "KIR"  "SDN"  "ATF"  "SLE"  "LAO"

Data wrangling

Booking cancelled / not-cancelled

The summary statistics indicate that bookings which were eventually canceled had a higher mean lead time of approximately 145 days, compared to a mean lead time of about 80 days for non-canceled bookings. The variation in lead times is also larger for canceled bookings, with a standard deviation of about 119 days, as opposed to 91 days for non-canceled bookings. This suggests a tendency for longer planned stays to be more susceptible to cancellation.

hotel_bookings %>%
  group_by(is_canceled) %>%
  summarise(
    mean_lead_time = mean(lead_time),
    median_lead_time = median(lead_time),
    sd_lead_time = sd(lead_time),
    min_lead_time = min(lead_time),
    max_lead_time = max(lead_time)
  )
## # A tibble: 2 × 6
##   is_canceled mean_lead_time median_lead_time sd_lead_time min_lead_time
##         <int>          <dbl>            <dbl>        <dbl>         <int>
## 1           0           80.0               45         91.1             0
## 2           1          145.               113        119.              0
## # ℹ 1 more variable: max_lead_time <int>

ADR by Hotel Category

For City Hotel, the average daily rate (ADR) is around 105.30 with a median of 99.9, while the Resort Hotel has a lower average ADR of approximately 94.95 with a median of 75. The standard deviation is higher (indicating seasonal interests) for the Resort Hotel at 61.44 compared to 43.60 for the City Hotel, indicating more variability in rates at the Resort Hotel. Interestingly, the minimum ADR for the Resort Hotel is negative, which could suggest some data entry errors or promotions.

hotel_bookings %>%
  group_by(hotel) %>%
  summarise(
    mean_adr = mean(adr),
    median_adr = median(adr),
    sd_adr = sd(adr),
    min_adr = min(adr),
    max_adr = max(adr)
  )
## # A tibble: 2 × 6
##   hotel        mean_adr median_adr sd_adr min_adr max_adr
##   <chr>           <dbl>      <dbl>  <dbl>   <dbl>   <dbl>
## 1 City Hotel      105.        99.9   43.6    0       5400
## 2 Resort Hotel     95.0       75     61.4   -6.38     508

Customer type

Contract customers request the most extras, while Transient-Party customers request the least, and most customers across all types don’t request any extras at all. Despite the variation in requests, the maximum number of special requests is capped at 5 for all customer types.

hotel_bookings %>%
  group_by(customer_type) %>%
  summarise(
    mean_special_requests = mean(total_of_special_requests),
    median_special_requests = median(total_of_special_requests),
    sd_special_requests = sd(total_of_special_requests),
    min_special_requests = min(total_of_special_requests),
    max_special_requests = max(total_of_special_requests)
  )
## # A tibble: 4 × 6
##   customer_type mean_special_requests median_special_reque…¹ sd_special_requests
##   <chr>                         <dbl>                  <dbl>               <dbl>
## 1 Contract                      0.729                      0               0.879
## 2 Group                         0.645                      0               0.842
## 3 Transient                     0.632                      0               0.817
## 4 Transient-Pa…                 0.329                      0               0.621
## # ℹ abbreviated name: ¹​median_special_requests
## # ℹ 2 more variables: min_special_requests <int>, max_special_requests <int>

Meal types

Bookings with Half Board (HB) have the longest average lead time and highest average daily rate (ADR), while Self Catering (SC) has the shortest lead time and a moderate ADR. Full Board (FB) and Bed & Breakfast (BB) fall in between, with FB having a higher ADR than BB. Undefined meal options have a lower ADR than HB and FB, suggesting less expenditure on meals correlates with lower spending on accommodation.

meal_lead_time_adr <- hotel_bookings %>%
  group_by(meal) %>%
  summarise(
    mean_lead_time = mean(lead_time),
    mean_adr = mean(adr)
  ) %>%
  arrange(desc(mean_lead_time))

meal_lead_time_adr
## # A tibble: 5 × 3
##   meal      mean_lead_time mean_adr
##   <chr>              <dbl>    <dbl>
## 1 HB                 143.     120. 
## 2 FB                 117.     109. 
## 3 BB                 102.      99.4
## 4 Undefined           87.3     91.9
## 5 SC                  70.7     98.3

Room types

Room type H has the highest cancellation rate and a relatively high average stay length, while room type L has a low average stay length and a perfect cancellation rate, indicating all bookings were canceled. Variability in stay length is moderate across all room types, with room type P showing the highest variability. (P showing 100% cancellation might be due to insufficient data points)

# Analyzing booking cancellations by room type
hotel_bookings %>%
  group_by(reserved_room_type) %>%
  summarise(
    cancellation_rate = mean(is_canceled),
    avg_stay_length = mean(stays_in_week_nights),
    sd_stay_length = sd(stays_in_week_nights)
  )
## # A tibble: 10 × 4
##    reserved_room_type cancellation_rate avg_stay_length sd_stay_length
##    <chr>                          <dbl>           <dbl>          <dbl>
##  1 A                              0.391           2.29            1.73
##  2 B                              0.329           2.44            1.44
##  3 C                              0.330           3.37            2.09
##  4 D                              0.318           2.99            2.13
##  5 E                              0.293           3.41            2.66
##  6 F                              0.304           2.68            1.97
##  7 G                              0.364           3.09            2.16
##  8 H                              0.408           2.93            1.98
##  9 L                              0.333           1               0   
## 10 P                              1               0.917           3.18

Month-wise analysis

August shows the highest average daily rate among all months, while February has the lowest. The data also reveals a significant range in rates within months; for instance, March has an unusually high maximum daily rate compared to other months. This could indicate seasonality in pricing or the presence of special events or premium options in certain periods. Negative minimum daily rates suggests possible data entry issues or promotions that need further investigation.

# Analyzing average daily rates by month
hotel_bookings %>%
  group_by(arrival_date_month) %>%
  summarise(
    average_daily_rate = mean(adr),
    max_daily_rate = max(adr),
    min_daily_rate = min(adr)
  ) %>%
  arrange(desc(average_daily_rate))
## # A tibble: 12 × 4
##    arrival_date_month average_daily_rate max_daily_rate min_daily_rate
##    <chr>                           <dbl>          <dbl>          <dbl>
##  1 August                          140.            450            0   
##  2 July                            127.            508            0   
##  3 June                            117.            334.           0   
##  4 May                             109.            510            0   
##  5 September                       105.            345            0   
##  6 April                           100.            336            0   
##  7 October                          87.9           300            0   
##  8 December                         81.1           452.           0   
##  9 March                            80.7          5400           -6.38
## 10 November                         73.8           260            0   
## 11 February                         73.6           284.           0   
## 12 January                          70.4           284.           0

Special requests

The cancellation rate for hotel bookings decreases as the number of special requests increases. Bookings with no special requests have the highest cancellation rate at approximately 47.72%, while those with five special requests have a significantly lower cancellation rate of 5%. This trend suggests that bookings with more special requests are less likely to be canceled, potentially indicating a higher commitment level from guests who make such requests.

hotel_bookings %>%
  group_by(total_of_special_requests) %>%
  summarise(
    cancellation_rate = mean(is_canceled)
  ) %>%
  arrange(desc(cancellation_rate))
## # A tibble: 6 × 2
##   total_of_special_requests cancellation_rate
##                       <int>             <dbl>
## 1                         0             0.477
## 2                         2             0.221
## 3                         1             0.220
## 4                         3             0.179
## 5                         4             0.106
## 6                         5             0.05
special_requests_cancellation <- hotel_bookings %>%
  group_by(total_of_special_requests) %>%
  summarise(cancellation_rate = mean(is_canceled)) %>%
  arrange(desc(cancellation_rate))

# Create a line plot
ggplot(special_requests_cancellation, aes(x = total_of_special_requests, y = cancellation_rate, group = 1)) +
  geom_line() +
  geom_point() +
  labs(x = "Total of Special Requests", y = "Cancellation Rate", title = "Cancellation Rate by Number of Special Requests") +
  theme_minimal()