Data is from https://www.kaggle.com/jessemostipak/hotel-booking-demand We’ll do some analysis on guest reservation cancellation behaviour and build a random forest model to predict whether guests would cancel their reservations.
library(tidyverse)
library(caret)
library(pROC)
library(zoo)
library(corrplot)
library(ROCR)
options(dplyr.summarise.inform = FALSE)
data <- read.csv("hotel_bookings.csv")
str(data)
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
summary(data)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Length:119390
## Class :character
## Mode :character
##
##
##
##
Features of the dataset
Hotel: resort (H1) or city hotel (H2). Both hotels are located in Portugal: H1, a resort in Algarve, H2 a city hotel in Lisbon
is_canceled: 1=cancelled, 0=not cancelled
lead_time: no. of days elapsed between entering data of booking and the arrival date
arrival_date_year: year of arrival date (2015-2017)
arrival_date_month: month of arrival date (Jan-Dec)
arrival_date_week_number: week number of year of arrival date (1-53)
arrival_date_day_of_month: day of arrival date (1-31)
stays_in_weekend_nights: no. of Saturday/Sundays guest stayed or booked to stay
stays_in_week_nights: no. of week nights (Monday-Friday) guest stayed or booked to stay
adults: no. of adults
children: no. of children
babies: no. of babies
meal: type of meal booked (BB: bed & breakfast, HB: half board, FB: full board etc)
country: country of origin in ISO (3 letters)
market_segment: market segment designation (Direct, TA: travel agents, TO: tour operators etc.)
distribution_channel: booking distribution channek (TA: travel agents, TO: tour operators etc.)
is_repeated_guest: 1=from repeated guest, 0=not repeated guest
previous_cancellations: no. of previous bookings that were cancelled prior to current booking
previous_bookings_not_canceled
reserved_room_type: A-D
assigned_room_type: A-D
booking_changes: no. of changes made to booking from the moment the booking was entered
deposit_type: if the customer made a deposit to guarantee the booking
agent: ID of the travel agency that made the booking
company: ID of the company that made the booking or responsible for paying
days_in_waiting_list: no. of days the booking was in waiting list before confirmed to the customer
customer_type: type of booking (transient, transient-party, contract etc)
adr: average daily rate (sum of all lodging transactions/total no. of staying nights)
required_car_park: no. of car parking spaces required by customer
total_of_special_requests: no. of special requests made (e.g. twin bed, high floor etc)
reservation_status: canceled, check-out etc
reservation_status_date
which(is.na(data$children))
## [1] 40601 40668 40680 41161
data[c(40601, 40668, 40680, 41161),]
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 40601 City Hotel 1 2 2015 August
## 40668 City Hotel 1 1 2015 August
## 40680 City Hotel 1 1 2015 August
## 41161 City Hotel 1 8 2015 August
## arrival_date_week_number arrival_date_day_of_month
## 40601 32 3
## 40668 32 5
## 40680 32 5
## 41161 33 13
## stays_in_weekend_nights stays_in_week_nights adults children babies meal
## 40601 1 0 2 NA 0 BB
## 40668 0 2 2 NA 0 BB
## 40680 0 2 3 NA 0 BB
## 41161 2 5 2 NA 0 BB
## country market_segment distribution_channel is_repeated_guest
## 40601 PRT Undefined Undefined 0
## 40668 PRT Direct Undefined 0
## 40680 PRT Undefined Undefined 0
## 41161 PRT Online TA Undefined 0
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## 40601 0 0 B
## 40668 0 0 B
## 40680 0 0 B
## 41161 0 0 B
## assigned_room_type booking_changes deposit_type agent company
## 40601 B 0 No Deposit NULL NULL
## 40668 B 0 No Deposit 14 NULL
## 40680 B 0 No Deposit NULL NULL
## 41161 B 0 No Deposit 9 NULL
## days_in_waiting_list customer_type adr required_car_parking_spaces
## 40601 0 Transient-Party 12.0 0
## 40668 0 Transient-Party 12.0 0
## 40680 0 Transient-Party 18.0 0
## 41161 0 Transient-Party 76.5 0
## total_of_special_requests reservation_status reservation_status_date
## 40601 1 Canceled 2015-08-01
## 40668 1 Canceled 2015-08-04
## 40680 2 Canceled 2015-08-04
## 41161 1 Canceled 2015-08-09
There are 4 missing values in the data.
data$country[data$country == "CN"] <- "CHN"
Some entries of China were recorded as “CN” rather than “CHN”.
Which countries booked the most?
booking_stats <- data %>%
group_by(country) %>%
summarize(total_bookings = n()) %>%
arrange(desc(total_bookings))
iso_continent <- read.csv("iso_continent.csv")
iso_continent <- iso_continent[, c("Continent_Name", "Three_Letter_Country_Code")]
colnames(iso_continent) = c("continent", "country")
booking_stats <- left_join(booking_stats, iso_continent, by = "country")
ggplot(booking_stats[1:20,], aes(x = reorder(country, -total_bookings), y = total_bookings, fill = continent, color = continent)) +
geom_bar(stat="identity", width = 0.75) +
geom_text(aes(label = total_bookings), position = position_dodge(width=1), vjust = -0.25, size = 2.4) +
ggtitle("Top 20 booking countries") +
xlab("Country") + ylab("Total bookings") +
theme(axis.text.x = element_text(angle = 90),
plot.title = element_text(hjust = 0.5)) +
scale_fill_manual("Continent", values = c("#58508d", "#bc5090", "#ff6361", "#ffa600")) +
scale_colour_manual("Continent", values = c("#58508d", "#bc5090", "#ff6361", "#ffa600"))
Number of bookings the city hotel and resort received respectively
data %>%
select(hotel, stays_in_week_nights, stays_in_weekend_nights) %>%
mutate(total_stay = stays_in_week_nights + stays_in_weekend_nights) %>%
ggplot(aes(x=hotel, y = total_stay, color = hotel)) + geom_boxplot() +
ggtitle("How long guests stay at the two hotels?") +
xlab("Hotel") + ylab("Length of stay per guest in days") +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
theme(plot.title = element_text(hjust = 0.5))
How long in advance did people make reservations?
data %>%
select(hotel, stays_in_week_nights, stays_in_weekend_nights) %>%
mutate(total_stay = stays_in_week_nights + stays_in_weekend_nights) %>%
ggplot(aes(x=total_stay, color = hotel, fill = hotel, alpha = 0.1)) + geom_density() +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
scale_fill_manual("Hotel", values = c("#142459", "#1AC9E6")) +
ggtitle("Distribution of lengths of stay per guest (days)") +
xlab("Length of stay per guest in days") +
ylab("Density") +
guides(alpha=FALSE) +
theme(plot.title = element_text(hjust = 0.5))
So, people tend to stay longer at the resort than at the city hotel.
Do peoople from different nationalities have different preferences in booking resorts and city hotels?
top_10 <- booking_stats$country[1:10]
country_hotel_bookings <- data %>%
select(hotel, country) %>%
group_by(country, hotel) %>%
summarize(bookings = n()) %>%
filter(country %in% top_10)
ggplot(country_hotel_bookings, aes(x=reorder(country, -bookings), y = bookings, fill = hotel)) +
geom_bar(stat="identity", position = "dodge", width = 0.75) +
geom_text(aes(label = bookings), position = position_dodge(width=1), hjust = -0.1, size = 3, angle = 90) +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("City hotel and resort bookings breakdown for top 10 countries") +
xlab("Country") + ylab("Bookings") +
lims(y=c(0, 35000)) +
scale_fill_manual("Hotel", values = c("#142459", "#1AC9E6")) +
theme(plot.title = element_text(hjust = 0.5))
Among the top 10 visiting countries, it seems that more guests from continental Europe, China and Brazil booked for the city hotel whereas guests from the UK and Ireland were more likely to stay at the resort hotel.
How long did guests stay?
country_hotel_length <- data %>%
select(hotel, country, stays_in_week_nights, stays_in_weekend_nights) %>%
mutate(len_stay = stays_in_weekend_nights + stays_in_week_nights) %>%
filter(country %in% top_10)
medians <- country_hotel_length %>%
group_by(hotel) %>%
summarize(med = median(len_stay))
ggplot(country_hotel_length, aes(x=hotel, y=len_stay, color = hotel)) +
geom_boxplot() + ggtitle("How long do people stay in city hotels and resorts?") +
xlab("Hotel") + ylab("Length of stay (days)") +
scale_x_discrete(labels = c('City\nHotel', 'Resort\nHotel')) +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
geom_text(data = medians, aes(x = hotel, y = med, label = med, color = hotel), hjust = 13, show.legend = FALSE) + theme(plot.title = element_text(hjust = 0.5))
On average, people tended to stay longer at the resort hotel.
How long did guests from different countries stay?
ggplot(country_hotel_length, aes(x=hotel, y=len_stay, color = hotel)) +
geom_boxplot() + facet_wrap(~country) +
ggtitle("Length of stays per guest (in days) from different countries") +
xlab("Hotel") + ylab("Length of stays per guest (in days)") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
scale_x_discrete(labels = c('City\nHotel', 'Resort\nHotel'))
ggplot(country_hotel_length, aes(x = len_stay, color = hotel, fill = hotel, alpha = 0.05)) +
geom_density() + facet_wrap(~country) +
ggtitle("Length of stays per guest (in days) from different countries") +
xlab("Hotel") + ylab("Density") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
scale_fill_manual("Hotel", values = c("#142459", "#1AC9E6")) +
guides(alpha=FALSE)
We can also see that for each country, more people booked at the city hotel compared to the resort, but stay in the resort for longer.
Did guests from different countries book differently?
data%>%
select(hotel, country, customer_type) %>%
filter(country %in% top_10) %>%
ggplot(aes(x=hotel, fill=customer_type)) +
geom_bar(position="dodge") +
facet_wrap(~country, scales = "free") +
scale_fill_manual("Customer type", values = c("#003f5c", "#58508d", "#ff6361", "#ffa600")) +
scale_x_discrete(labels = c('City\nHotel', 'Resort\nHotel')) +
theme(legend.key.size = unit(0.2, 'cm')) +
ggtitle("Did guests from different countries book differently?") +
xlab("Hotel") + ylab("Bookings") +
theme(plot.title = element_text(hjust = 0.5))
It seems that there were much more contracted bookings from the UK and Ireland at the resort hotel compared to other countries.
Mean daily price of hotels
data$arrival_date <- as.Date(paste(data$arrival_date_year, data$arrival_date_month, data$arrival_date_day_of_month,sep = "-"), "%Y-%B-%d")
data %>%
select(hotel, arrival_date, adr) %>%
group_by(hotel, arrival_date) %>%
summarize(mean_price = mean(adr)) %>%
ggplot(aes(x=arrival_date, y = mean_price, color = hotel)) + geom_line() +
ggtitle("Daily rate of hotels") +
xlab("Arrival date") +
ylab("Mean daily rate across all types of room") +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
theme(plot.title = element_text(hjust = 0.5))
Rates were higher during summer months and lower during winter months. Difference in prices between summer and winter were more pronounced for the resort.
How long in advance did people book?
data %>%
select(hotel, arrival_date, lead_time) %>%
group_by(hotel, arrival_date) %>%
summarize(median_lead_time = median(lead_time)) %>%
ggplot(aes(x=arrival_date, y = median_lead_time, color = hotel)) +
geom_line() + ggtitle("Median lead time for arrivals on each day") +
xlab("Arrival date") + ylab("Median lead time") +
scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
theme(plot.title = element_text(hjust = 0.5))
Guests tended to book more in advance for stays in summers, booking pattern was quite similar for both hotels.
We want to study cancellation of hotel reservations. So let’s do some EDA for that.
ggplot(data, aes(x=reservation_status, fill = reservation_status)) + geom_bar() +
geom_text(stat = "count", aes(label = ..count.., color = reservation_status), vjust = -0.2, size = 3.5) +
scale_fill_manual("Reservation Status", values=c("#C70039", "#16A085", "#2980B9")) +
scale_color_manual("Reservation Status", values=c("#C70039", "#16A085", "#2980B9")) +
xlab("Reservation Status") + ylab("Counts") + ggtitle("Breakdown of reservation status") +
theme(plot.title = element_text(hjust = 0.5))
36.5% of the reservations were canceled and 1% were no-show.
Cancellation breakdown for each hotel
proportions <- data %>%
group_by(hotel, reservation_status) %>%
summarize(counts = n()) %>%
mutate(proportions = round(counts/sum(counts)*100,2))
ggplot(data, aes(x=reservation_status, fill = reservation_status)) + geom_bar() +
facet_wrap(~hotel)+
geom_text(data = proportions, aes(x=reservation_status, y=counts, label =str_c(proportions,'%'), color = reservation_status), vjust = -0.2, size = 3.5) +
scale_fill_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
scale_color_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
xlab("Reservation Status") + ylab("Counts") + ggtitle("Breakdown of reservation status") +
theme(plot.title = element_text(hjust = 0.5))
If guests have paid deposit, would they be less likely to cancel their reservation?
data %>%
select(reservation_status, deposit_type) %>%
table() %>%
prop.table()
## deposit_type
## reservation_status No Deposit Non Refund Refundable
## Canceled 2.388977e-01 1.211157e-01 2.931569e-04
## Check-Out 6.277494e-01 7.789597e-04 1.055365e-03
## No-Show 9.816568e-03 2.847810e-04 8.375911e-06
Indeed, people who paid no deposit were more likely to cancel than those who have paid deposit (most deposits are non-refundable). However, even if someone has paid no deposit, it’s still more likely that they would not cancel.
Do higher prices make guests more likely to cancel their reservation, maybe they have found better prices elsewhere?
data$is_canceled <- as.factor(data$is_canceled)
data %>%
select(hotel, is_canceled, adr, reservation_status) %>%
ggplot(aes(x=reservation_status, y = adr, color = reservation_status)) + geom_boxplot()+
facet_wrap(~hotel) +
lims(y=c(0, 600)) +
scale_color_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
ggtitle("Distribution of average rate per night \n that ended up as check-in/cancelled/no-show") + xlab("Reservation status") + ylab("Average daily rate")
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
theme(plot.title = element_text(hjust = 0.5))
## List of 1
## $ plot.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
Prices seemed to have a small effect on whether people would cancel or not show up for the city hotel. The effect was more pronounced for the resort, lower prices may have led to more people not showing up.
Time trend of cancellations and no-show
data%>%
select(hotel, reservation_status, arrival_date) %>%
mutate(arrival_month = as.yearmon(arrival_date, "%B %Y")) %>%
group_by(hotel, reservation_status, arrival_month) %>%
summarize(count = n()) %>%
ggplot(aes(x = arrival_month, y = count, fill = reservation_status)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 90)) +
scale_fill_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
ggtitle("Trend of cancellations and no-show") + xlab("Arrival date") +
ylab("Bookings") +
facet_wrap(~hotel) +
theme(plot.title = element_text(hjust = 0.5))
They largely followed the booking trend, in summer months when more people booked, there were also more cancellations.
Special requests
table(data$total_of_special_requests)
##
## 0 1 2 3 4 5
## 70318 33226 12969 2497 340 40
Was there a correlation between special requests and cancellation?
data %>%
ggplot(aes(x = total_of_special_requests, fill = reservation_status)) +
geom_bar() +
scale_fill_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
ggtitle("How many people with special requests cancelled booking or did not show up?") +
xlab("Number of special requests") +
ylab("Number of bookings") +
theme(plot.title = element_text(hjust = 0.5, size = 12))
It seems like people with special requests tend to not cancel/no show. Perhaps hotels can prepare more facilities to cater for people with special requests.
Lead time and cancellation
data %>%
select(hotel, lead_time, reservation_status) %>%
ggplot(aes(x=reservation_status, y = lead_time, color = hotel)) +
geom_boxplot() +scale_color_manual("Hotel", values = c("#142459", "#1AC9E6")) +
xlab("Reservation status") + ylab("Lead time (days)") +
ggtitle("Distribution of booking lead time for each reservation status") +
theme(plot.title = element_text(hjust = 0.5))
Were guests who have canceled previously more likely to cancel their reservation again?
data %>%
select(hotel, previous_cancellations, reservation_status) %>%
ggplot(aes(x=reservation_status, y=previous_cancellations, color = reservation_status)) + geom_boxplot() +
scale_fill_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
ggtitle("Distribution of number of previous cancellations each customer made") +
xlab("Reservation status") + ylab("Number of previous cancellations each customer made") +
theme(plot.title = element_text(hjust = 0.5))
Did different nationalities have different tendencies when it comes to canceling reservations? We’ll look at the top 30 booking countries.
country_status <- data %>%
group_by(country, reservation_status) %>%
summarize(counts = n()) %>%
spread(key = reservation_status, value = counts) %>%
mutate(Total = Canceled + `Check-Out` + `No-Show`) %>%
arrange(desc(Total)) %>%
head(30) %>%
mutate(Canceled = Canceled/Total) %>%
mutate(`No-Show` = `No-Show`/Total) %>%
mutate(`Check-Out` = `Check-Out`/Total) %>%
select(-Total) %>%
gather(key = reservation_status, proportion, Canceled:`No-Show`)
ggplot(country_status,
aes(x=country, y = proportion, fill = reservation_status))+
geom_bar(stat ="identity") +
scale_fill_manual("Reservation status", values=c("#C70039", "#16A085", "#2980B9")) +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("Countries cancelling reservations and not showing up") +
xlab("Country") + ylab("Proportion of bookings") +
theme(plot.title = element_text(hjust = 0.5))
Looks like Portuguese, Angolese and Moroccans had the largest cancellation percentage. There were 47039 Portuguese bookings (55.1% canceled/no-show), 355 Angolese bookings (39% canceled/no-show) and 256 Morrocan bookings (39% canceled/no-show). Cancellation/no-show proportions were also high for Turkish (38.7%), Russians (36.7%), Luxembourgers (36.6%), Brazilians (35.9%) and Italians (34.9%).
So far we’ve addressed relationships between reservation status and deposit, price, arrival month, number of special requests, lead time, people with cancellation history.
Now we’ll clean some data and investigate predictor correlation.
boxplot(data$adr)
title("Distribution of average daily rate")
“adr” has some negative values and outlier (5400), these will be removed.
data <- data %>%
filter(adr > 0) %>%
filter(adr < 5400)
data$is_canceled <- as.factor(data$is_canceled)
data$arrival_date_month <- as.factor(data$arrival_date_month)
data$meal <- as.factor(data$meal)
data$country <- as.factor(data$country)
data$market_segment <- as.factor(data$market_segment)
data$distribution_channel <- as.factor(data$distribution_channel)
data$reserved_room_type <- as.factor(data$reserved_room_type)
data$assigned_room_type <- as.factor(data$assigned_room_type)
data$deposit_type <- as.factor(data$deposit_type)
data$customer_type <- as.factor(data$customer_type)
data$reservation_status <- as.factor(data$reservation_status)
data$arrival_date <- as.Date(data$arrival_date)
agent, company will be dropped
data$agent <- NULL
data$company <- NULL
numeric <- data %>% select(where(is.numeric))
numeric <- numeric %>% rename("arr_year" = "arrival_date_year") %>%
rename("arr_week" = "arrival_date_week_number") %>%
rename("arr_dom" = "arrival_date_day_of_month") %>%
rename("stay_weekends" = "stays_in_weekend_nights") %>%
rename("stay_weekdays" = "stays_in_week_nights") %>%
rename("returned_guest" = "is_repeated_guest") %>%
rename("past_cancel" = "previous_cancellations") %>%
rename("past_notcanc" = "previous_bookings_not_canceled") %>%
rename("book_change" = "booking_changes") %>%
rename("waiting_days" = "days_in_waiting_list") %>%
rename("park_spaces" = "required_car_parking_spaces") %>%
rename("spec_request" = "total_of_special_requests")%>%
cbind("is_canceled" = as.integer(data$is_canceled))
corrplot(cor(numeric, use="complete.obs"), method = "color", tl.cex=0.75,
tl.col = "black")
data$hotel <- as.factor(data$hotel)
There are many countries in the dataset, so we’ll group all those with total bookings less than 20 as “Others”.
other_countries <- booking_stats$country[booking_stats$total_bookings < 20]
data$country <- as.character(data$country)
data$country[data$country %in% other_countries] <- "Others"
data$country <- as.factor(data$country)
Missing data in “children” will be imputed using the mode, Knnimpute and bagImpute will be much more computationally expensive.
data$children[is.na(data$children)] <- mode(data$children)
data$children <- as.factor(data$children)
The dataset is quite big and the model may be too slow to trained, so for this case, we’ll just consider data from summer 2016 to summer 2017.
data_2017 <- data %>%
filter(reservation_status_date > "2016-07-01" & reservation_status_date < "2017-07-01")
data_2017$reservation_status_date <- NULL
data_2017 <- data_2017[, c(1:28)]
Train/test partition
8% of the data was used for training the model, this is a very small proportion. 8% was chosen because it took too long to train the model. Parallel programming (if hardware allows) or running it on cloud would be helpful.
set.seed(1234)
train_index <- createDataPartition(data_2017$is_canceled, p= 0.08, list = FALSE)
train <- data_2017[train_index,]
test <- data_2017[-train_index,]
A random forest algorithm will be used. We’ll load the random forest model that has been trained previously using following code:
#myControl <- trainControl(method = "repeatedcv", number = 4, repeats = 2)
#model_rf <- train(is_canceled ~ ., data = train, method = 'rf',
# trainControl = myControl, tuneLength = 4)
model_rf <- readRDS("./hotel_model_rf_actual.rds")
#model_rf$finalModel
Checking if the random forest converged.
model_rf_oob_error_rate <- data.frame(Trees = rep(1:nrow(model_rf$finalModel$err.rate), times = 3),
Type = rep(c("OOB", "Not Canceled", "Canceled"), each =nrow(model_rf$finalModel$err.rate)),
Error = c(model_rf$finalModel$err.rate[,"OOB"],
model_rf$finalModel$err.rate[,"0"],
model_rf$finalModel$err.rate[,"1"]))
head(model_rf_oob_error_rate)
## Trees Type Error
## 1 1 OOB 0.2597715
## 2 2 OOB 0.2570590
## 3 3 OOB 0.2525667
## 4 4 OOB 0.2457098
## 5 5 OOB 0.2397361
## 6 6 OOB 0.2320507
ggplot(model_rf_oob_error_rate, aes(x=Trees, y=Error, color = Type))+
geom_line() + ggtitle("Has the forest converged?") +
theme(plot.title = element_text(hjust = 0.5))
The forest did converge. The OOB error is 17.29%.
auc <- roc(train$is_canceled, model_rf$finalModel$votes[,"1"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
train_with_predict <- train
train_with_predict$arrival_date <- as.Date(paste(train_with_predict$arrival_date_year, train_with_predict$arrival_date_month, train_with_predict$arrival_date_day_of_month,sep = "-"), "%Y-%B-%d")
compared <- train_with_predict %>%
select(arrival_date, is_canceled) %>%
mutate(predicted = model_rf$finalModel$predicted)%>%
group_by(arrival_date) %>%
summarize(actual = sum(as.integer(is_canceled)), predicted = sum(as.integer(predicted)))
Comparison between actual data and out-of-bag prediction
ggplot(compared, aes(x=arrival_date)) + geom_line(aes(y = actual, color = "blue")) +
geom_line(aes(y=predicted, color = "red")) +ggtitle("Actual data vs. random forest OOB prediction") +
xlab("Arrival date") + ylab("Number of cancellations per day") +
scale_color_discrete(name = "Values", labels = c("Actual", "Predicted"))
preds <- predict(model_rf, test)
confusionMatrix(test$is_canceled, preds)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 30679 2743
## 1 5743 12635
##
## Accuracy : 0.8362
## 95% CI : (0.833, 0.8394)
## No Information Rate : 0.7031
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6285
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8423
## Specificity : 0.8216
## Pos Pred Value : 0.9179
## Neg Pred Value : 0.6875
## Prevalence : 0.7031
## Detection Rate : 0.5923
## Detection Prevalence : 0.6452
## Balanced Accuracy : 0.8320
##
## 'Positive' Class : 0
##
The model has an accuracy of 83.6% on the test set. Specificity is 82.16% and sensitivity is 84.23%. Ideally, the model should be tuned (using a bigger train set or longer tuneLength) to improve specificity, to lower the chance that we incorrectly identify a check-in as a cancellation. Other algorithms can also be tried.