library(conflicted)
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
library(ggplot2)
hotel_bookings <- read.csv("challenge_datasets/hotel_bookings.csv")
as_tibble(hotel_bookings)
## # A tibble: 119,390 × 32
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <int> <int> <int> <chr>
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## 7 Resort Hotel 0 0 2015 July
## 8 Resort Hotel 0 9 2015 July
## 9 Resort Hotel 1 85 2015 July
## 10 Resort Hotel 1 75 2015 July
## # ℹ 119,380 more rows
## # ℹ 27 more variables: arrival_date_week_number <int>,
## # arrival_date_day_of_month <int>, stays_in_weekend_nights <int>,
## # stays_in_week_nights <int>, adults <int>, children <int>, babies <int>,
## # meal <chr>, country <chr>, market_segment <chr>,
## # distribution_channel <chr>, is_repeated_guest <int>,
## # previous_cancellations <int>, previous_bookings_not_canceled <int>, …
In this section, we will do an elementary analysis to understand the data.
# Visualize the first few rows of the data
head(hotel_bookings)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 27 1 0
## 2 27 1 0
## 3 27 1 0
## 4 27 1 0
## 5 27 1 0
## 6 27 1 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 0 2 0 0 BB PRT Direct
## 2 0 2 0 0 BB PRT Direct
## 3 1 1 0 0 BB GBR Direct
## 4 1 1 0 0 BB GBR Corporate
## 5 2 2 0 0 BB GBR Online TA
## 6 2 2 0 0 BB GBR Online TA
## distribution_channel is_repeated_guest previous_cancellations
## 1 Direct 0 0
## 2 Direct 0 0
## 3 Direct 0 0
## 4 Corporate 0 0
## 5 TA/TO 0 0
## 6 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 C C
## 2 0 C C
## 3 0 A C
## 4 0 A A
## 5 0 A A
## 6 0 A A
## booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1 3 No Deposit NULL NULL 0 Transient
## 2 4 No Deposit NULL NULL 0 Transient
## 3 0 No Deposit NULL NULL 0 Transient
## 4 0 No Deposit 304 NULL 0 Transient
## 5 0 No Deposit 240 NULL 0 Transient
## 6 0 No Deposit 240 NULL 0 Transient
## adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 0 0 0 Check-Out
## 2 0 0 0 Check-Out
## 3 75 0 0 Check-Out
## 4 75 0 0 Check-Out
## 5 98 0 1 Check-Out
## 6 98 0 1 Check-Out
## reservation_status_date
## 1 2015-07-01
## 2 2015-07-01
## 3 2015-07-02
## 4 2015-07-02
## 5 2015-07-03
## 6 2015-07-03
The summary table provides a lot of insights on the hotel bookings data. * is_cancelled: The data reveals that about 37% of the bookings are cancelled. * lead_time: The number of days between the booking date and arrival date. The average lead time is around 104 days. * arrival_date_year: Bookings span from 2015 to 2017. * stays_in_weekend_nights and stays_in_week_nights: Guests stay, on average, 1 day over the weekend and 2.5 days during the week. * adults: Most bookings are for two adults, likely indicating couples or pairs of travelers. * children and babies: There are very few bookings with children or babies. * is_repeated_guest: Only about 3.19% of the bookings were made by repeat guests. * previous_cancellations: On average, there’s a low number of previous cancellations per guest, with a mean around 0.087. * booking_changes: Changes to bookings are relatively infrequent, with an average of 0.221 changes per booking. * adr: The average daily rate for a room is about 94.58, indicating the average revenue per booked room.
# Get a summary of the dataset
summary(hotel_bookings)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Length:119390
## Class :character
## Mode :character
##
##
##
##
# View the structure of the data
str(hotel_bookings)
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
We get to understand the hotel types using the following command.
# Get unique hotel types, countries
print(unique(hotel_bookings$hotel))
## [1] "Resort Hotel" "City Hotel"
print(unique(hotel_bookings$country))
## [1] "PRT" "GBR" "USA" "ESP" "IRL" "FRA" "NULL" "ROU" "NOR" "OMN"
## [11] "ARG" "POL" "DEU" "BEL" "CHE" "CN" "GRC" "ITA" "NLD" "DNK"
## [21] "RUS" "SWE" "AUS" "EST" "CZE" "BRA" "FIN" "MOZ" "BWA" "LUX"
## [31] "SVN" "ALB" "IND" "CHN" "MEX" "MAR" "UKR" "SMR" "LVA" "PRI"
## [41] "SRB" "CHL" "AUT" "BLR" "LTU" "TUR" "ZAF" "AGO" "ISR" "CYM"
## [51] "ZMB" "CPV" "ZWE" "DZA" "KOR" "CRI" "HUN" "ARE" "TUN" "JAM"
## [61] "HRV" "HKG" "IRN" "GEO" "AND" "GIB" "URY" "JEY" "CAF" "CYP"
## [71] "COL" "GGY" "KWT" "NGA" "MDV" "VEN" "SVK" "FJI" "KAZ" "PAK"
## [81] "IDN" "LBN" "PHL" "SEN" "SYC" "AZE" "BHR" "NZL" "THA" "DOM"
## [91] "MKD" "MYS" "ARM" "JPN" "LKA" "CUB" "CMR" "BIH" "MUS" "COM"
## [101] "SUR" "UGA" "BGR" "CIV" "JOR" "SYR" "SGP" "BDI" "SAU" "VNM"
## [111] "PLW" "QAT" "EGY" "PER" "MLT" "MWI" "ECU" "MDG" "ISL" "UZB"
## [121] "NPL" "BHS" "MAC" "TGO" "TWN" "DJI" "STP" "KNA" "ETH" "IRQ"
## [131] "HND" "RWA" "KHM" "MCO" "BGD" "IMN" "TJK" "NIC" "BEN" "VGB"
## [141] "TZA" "GAB" "GHA" "TMP" "GLP" "KEN" "LIE" "GNB" "MNE" "UMI"
## [151] "MYT" "FRO" "MMR" "PAN" "BFA" "LBY" "MLI" "NAM" "BOL" "PRY"
## [161] "BRB" "ABW" "AIA" "SLV" "DMA" "PYF" "GUY" "LCA" "ATA" "GTM"
## [171] "ASM" "MRT" "NCL" "KIR" "SDN" "ATF" "SLE" "LAO"
The summary statistics indicate that bookings which were eventually canceled had a higher mean lead time of approximately 145 days, compared to a mean lead time of about 80 days for non-canceled bookings. The variation in lead times is also larger for canceled bookings, with a standard deviation of about 119 days, as opposed to 91 days for non-canceled bookings. This suggests a tendency for longer planned stays to be more susceptible to cancellation.
hotel_bookings %>%
group_by(is_canceled) %>%
summarise(
mean_lead_time = mean(lead_time),
median_lead_time = median(lead_time),
sd_lead_time = sd(lead_time),
min_lead_time = min(lead_time),
max_lead_time = max(lead_time)
)
## # A tibble: 2 × 6
## is_canceled mean_lead_time median_lead_time sd_lead_time min_lead_time
## <int> <dbl> <dbl> <dbl> <int>
## 1 0 80.0 45 91.1 0
## 2 1 145. 113 119. 0
## # ℹ 1 more variable: max_lead_time <int>
For City Hotel, the average daily rate (ADR) is around 105.30 with a median of 99.9, while the Resort Hotel has a lower average ADR of approximately 94.95 with a median of 75. The standard deviation is higher (indicating seasonal interests) for the Resort Hotel at 61.44 compared to 43.60 for the City Hotel, indicating more variability in rates at the Resort Hotel. Interestingly, the minimum ADR for the Resort Hotel is negative, which could suggest some data entry errors or promotions.
hotel_bookings %>%
group_by(hotel) %>%
summarise(
mean_adr = mean(adr),
median_adr = median(adr),
sd_adr = sd(adr),
min_adr = min(adr),
max_adr = max(adr)
)
## # A tibble: 2 × 6
## hotel mean_adr median_adr sd_adr min_adr max_adr
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 City Hotel 105. 99.9 43.6 0 5400
## 2 Resort Hotel 95.0 75 61.4 -6.38 508
Contract customers request the most extras, while Transient-Party customers request the least, and most customers across all types don’t request any extras at all. Despite the variation in requests, the maximum number of special requests is capped at 5 for all customer types.
hotel_bookings %>%
group_by(customer_type) %>%
summarise(
mean_special_requests = mean(total_of_special_requests),
median_special_requests = median(total_of_special_requests),
sd_special_requests = sd(total_of_special_requests),
min_special_requests = min(total_of_special_requests),
max_special_requests = max(total_of_special_requests)
)
## # A tibble: 4 × 6
## customer_type mean_special_requests median_special_reque…¹ sd_special_requests
## <chr> <dbl> <dbl> <dbl>
## 1 Contract 0.729 0 0.879
## 2 Group 0.645 0 0.842
## 3 Transient 0.632 0 0.817
## 4 Transient-Pa… 0.329 0 0.621
## # ℹ abbreviated name: ¹​median_special_requests
## # ℹ 2 more variables: min_special_requests <int>, max_special_requests <int>
Bookings with Half Board (HB) have the longest average lead time and highest average daily rate (ADR), while Self Catering (SC) has the shortest lead time and a moderate ADR. Full Board (FB) and Bed & Breakfast (BB) fall in between, with FB having a higher ADR than BB. Undefined meal options have a lower ADR than HB and FB, suggesting less expenditure on meals correlates with lower spending on accommodation.
meal_lead_time_adr <- hotel_bookings %>%
group_by(meal) %>%
summarise(
mean_lead_time = mean(lead_time),
mean_adr = mean(adr)
) %>%
arrange(desc(mean_lead_time))
meal_lead_time_adr
## # A tibble: 5 × 3
## meal mean_lead_time mean_adr
## <chr> <dbl> <dbl>
## 1 HB 143. 120.
## 2 FB 117. 109.
## 3 BB 102. 99.4
## 4 Undefined 87.3 91.9
## 5 SC 70.7 98.3
Room type H has the highest cancellation rate and a relatively high average stay length, while room type L has a low average stay length and a perfect cancellation rate, indicating all bookings were canceled. Variability in stay length is moderate across all room types, with room type P showing the highest variability. (P showing 100% cancellation might be due to insufficient data points)
# Analyzing booking cancellations by room type
hotel_bookings %>%
group_by(reserved_room_type) %>%
summarise(
cancellation_rate = mean(is_canceled),
avg_stay_length = mean(stays_in_week_nights),
sd_stay_length = sd(stays_in_week_nights)
)
## # A tibble: 10 × 4
## reserved_room_type cancellation_rate avg_stay_length sd_stay_length
## <chr> <dbl> <dbl> <dbl>
## 1 A 0.391 2.29 1.73
## 2 B 0.329 2.44 1.44
## 3 C 0.330 3.37 2.09
## 4 D 0.318 2.99 2.13
## 5 E 0.293 3.41 2.66
## 6 F 0.304 2.68 1.97
## 7 G 0.364 3.09 2.16
## 8 H 0.408 2.93 1.98
## 9 L 0.333 1 0
## 10 P 1 0.917 3.18
August shows the highest average daily rate among all months, while February has the lowest. The data also reveals a significant range in rates within months; for instance, March has an unusually high maximum daily rate compared to other months. This could indicate seasonality in pricing or the presence of special events or premium options in certain periods. Negative minimum daily rates suggests possible data entry issues or promotions that need further investigation.
# Analyzing average daily rates by month
hotel_bookings %>%
group_by(arrival_date_month) %>%
summarise(
average_daily_rate = mean(adr),
max_daily_rate = max(adr),
min_daily_rate = min(adr)
) %>%
arrange(desc(average_daily_rate))
## # A tibble: 12 × 4
## arrival_date_month average_daily_rate max_daily_rate min_daily_rate
## <chr> <dbl> <dbl> <dbl>
## 1 August 140. 450 0
## 2 July 127. 508 0
## 3 June 117. 334. 0
## 4 May 109. 510 0
## 5 September 105. 345 0
## 6 April 100. 336 0
## 7 October 87.9 300 0
## 8 December 81.1 452. 0
## 9 March 80.7 5400 -6.38
## 10 November 73.8 260 0
## 11 February 73.6 284. 0
## 12 January 70.4 284. 0
The cancellation rate for hotel bookings decreases as the number of special requests increases. Bookings with no special requests have the highest cancellation rate at approximately 47.72%, while those with five special requests have a significantly lower cancellation rate of 5%. This trend suggests that bookings with more special requests are less likely to be canceled, potentially indicating a higher commitment level from guests who make such requests.
hotel_bookings %>%
group_by(total_of_special_requests) %>%
summarise(
cancellation_rate = mean(is_canceled)
) %>%
arrange(desc(cancellation_rate))
## # A tibble: 6 × 2
## total_of_special_requests cancellation_rate
## <int> <dbl>
## 1 0 0.477
## 2 2 0.221
## 3 1 0.220
## 4 3 0.179
## 5 4 0.106
## 6 5 0.05
special_requests_cancellation <- hotel_bookings %>%
group_by(total_of_special_requests) %>%
summarise(cancellation_rate = mean(is_canceled)) %>%
arrange(desc(cancellation_rate))
# Create a line plot
ggplot(special_requests_cancellation, aes(x = total_of_special_requests, y = cancellation_rate, group = 1)) +
geom_line() +
geom_point() +
labs(x = "Total of Special Requests", y = "Cancellation Rate", title = "Cancellation Rate by Number of Special Requests") +
theme_minimal()