The dataset contains information on 119,390 bookings made by customers of the Splendor Hotel group from 24th June, 2013 to 31st August, 2017. Descriptive analysis revealed:
#Loading useful packages
library(tidyverse)
library(RColorBrewer)
library(scales)
library(extrafont)
#Setting a theme
theme_set(theme_classic()+
theme(
plot.title = element_text(hjust = 0.5,
family = "Corbel",
size = 20,
face = "bold"
),
plot.subtitle = element_text(hjust = 0.5,
family = "Corbel",
size = 16,
face = "bold"),
plot.caption = element_text(family = "Corbel",
size = 8,
face = "bold"),
axis.title.x = element_text(color = "black",
family = "Corbel",
size = 13,
face = "bold"),
axis.text.x = element_text(size = 11),
axis.title.y = element_text(color = "black",
family = "Corbel",
size = 13,
face = "bold"),
axis.text.y = element_text(size = 11),
panel.grid.major = element_line(color = "grey90")
)
)
#Loading dataset
SHG <- read.csv("SHG_BG.csv")
glimpse(SHG)
#Country of Origin
SHG %>%
mutate(Country = as.factor(Country),
Country = fct_lump_n(Country, 5),
Country = fct_relevel(Country, c("Portugal", "United Kingdom",
"France", "Spain", "Germany", "Other"))) %>%
ggplot(aes(x = fct_rev(Country), fill = Country))+
geom_bar(colour = "black",
linewidth = 1)+
geom_text(aes(label = scales::comma(after_stat(count))),
stat = "count", hjust = 1.1,
colour = "white", fontface = "bold")+
scale_x_discrete(name = "Country")+
scale_y_continuous(name = NULL,
limits = c(0, 50000),
breaks = seq(0, 50000, 10000),
expand = expansion(0))+
scale_fill_manual(values = c("#0576f7", "#f70596", "#f70596",
"#f70596", "#f70596", "#48494a"))+
coord_flip()+
theme(axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid.major = element_blank())+
labs(title = "Almost half of all bookings are from Portugal")
#Number of days stayed
SHG %>%
mutate(Country = as.factor(Country),
Country = fct_lump_n(Country, 5),
Country = fct_relevel(Country, c("Portugal", "United Kingdom",
"France", "Spain", "Germany",
"Other")),
Nights = case_when(
Nights <= 7 ~ "1 week or less",
Nights >7 & Nights <= 30 ~ "Up to a month",
#Nights >14 & Nights <= 30 ~ "Up to a month",
TRUE ~ "More than a month"
),
Nights = as_factor(Nights)) %>%
ggplot(aes(x = Country, fill = Nights))+
geom_bar(colour = "black", linewidth = 1,
position = position_dodge())+
geom_text(aes(label = scales::comma(after_stat(count))),
position = position_dodge(width = 0.9),
stat = "count", vjust = -0.5,
colour = "black", fontface = "bold")+
scale_y_continuous(name = NULL,
limits = c(0, 50000),
breaks = seq(0, 50000, 10000),
expand = expansion(0))+
scale_x_discrete(name = "Country")+
labs(title = "95% of Guests stay a week or less")+
theme(legend.position = "bottom",
legend.direction = "horizontal",
axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())
#Booking Channels
SHG %>%
filter(Distribution.Channel != "Undefined") %>%
group_by(Distribution.Channel) %>%
summarise(Channel.Count = n()) %>%
mutate(Channel.Count = Channel.Count/sum(Channel.Count)) %>%
ggplot(aes(x = " ", y = Channel.Count, fill = Distribution.Channel))+
geom_col(position = "Fill",
colour = "black",
linewidth = 1,
width = 0.6)+
geom_text(aes(label = scales::percent(Channel.Count)),
vjust = 1.5,
size = 4, position = "fill",
colour = "white", fontface = "bold"
)+
scale_x_discrete(name = "Distribution Channel")+
scale_y_continuous(name = NULL,
labels = scales::percent,
limits = c(0, 1.1),
breaks = seq(0,1, 0.25),
expand = expansion(0))+
scale_fill_manual(name = "Distribution Channel",
values = c("#f74e05", "#05f7d3", "#2505f7", "#05f721"))+
labs(title = "Most Passengers booked their reservation through
Online travel agents")+
theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())
#Number of Cancellations
SHG %>%
mutate(Status = case_when(
Status == "Canceled" ~ "Cancelled and No-show",
Status == "No-Show" ~ "Cancelled and No-show",
TRUE ~ "Check-Out")) %>%
group_by(Status) %>%
summarise(Status.Count = n()) %>%
mutate(Status.Count = Status.Count/sum(Status.Count)) %>%
ggplot(aes(x = " ", y = Status.Count, fill = Status))+
geom_col(position = "Fill",
colour = "black",
linewidth = 1,
width = 0.6)+
geom_text(aes(label = scales::percent(Status.Count)),
vjust = 1.5,
size = 4, position = "fill",
colour = "white", fontface = "bold")+
scale_x_discrete(name = "Booking Status")+
scale_y_continuous(name = NULL,
labels = scales::percent,
limits = c(0, 1.1),
breaks = seq(0,1, 0.25),
expand = expansion(0))+
scale_fill_manual(name = "Booking Status",
values = c("#f74e05", "#2505f7"))+
labs(title = "Up to 37% of all reservations were cancelled")+
theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())
#Proportion of cancellations per year
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date),
Status = case_when(
Status == "Canceled" ~ "Cancelled and No-show",
Status == "No-Show" ~ "Cancelled and No-show",
TRUE ~ "Check-Out")) %>%
filter(Booking.Year %in% c(2015, 2016, 2017)) %>%
group_by(Status, Booking.Year) %>%
summarise(Status.Count = n()) %>%
ungroup() %>%
group_by(Booking.Year) %>%
mutate(Status.Count = Status.Count/sum(Status.Count)) %>%
ggplot(aes(x = Booking.Year, y = Status.Count, fill = Status))+
geom_col(position = "Fill",
colour = "black",
linewidth = 1,
width = 0.6)+
geom_text(aes(label = scales::percent(Status.Count)),
vjust = 1.5,
size = 4, position = "fill",
colour = "white", fontface = "bold")+
scale_x_continuous(name = "Year",
limits = c(2014.5, 2017.5),
breaks = seq(2015, 2017, 1))+
scale_y_continuous(name = NULL,
labels = scales::percent,
limits = c(0, 1.1),
breaks = seq(0,1, 0.25),
expand = expansion(0))+
scale_fill_manual(name = "Booking Status",
values = c("#f74e05", "#2505f7"))+
labs(title = "Over 30% of all booked guests don't show up")+
theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())
#Proportion of cancellations per month Improve later
#1
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date)) %>%
filter(Booking.Year %in% c(2015, 2016, 2017)) %>%
mutate(Booking.Month = month(Booking.Date, label = TRUE),
Status = case_when(
Status == "Canceled" ~ "Cancelled and No-show",
Status == "No-Show" ~ "Cancelled and No-show",
TRUE ~ "Check-Out")) %>%
group_by(Status, Booking.Month, Booking.Year) %>%
summarise(Status.Count = n()) %>%
ungroup() %>%
group_by(Booking.Month, Booking.Year) %>%
mutate(Status.Count = round(Status.Count/sum(Status.Count), 3)) %>%
ggplot(aes(x = Booking.Month, y = Status.Count, fill = Status))+
geom_col(position = "Fill",
colour = "black",
linewidth = 1,
width = 0.6)+
geom_text(aes(label = scales::percent(Status.Count)),
vjust = 1.5,
size = 3.5, position = "fill",
colour = "white", fontface = "bold")+
scale_x_discrete(name = "Month")+
scale_y_continuous(name = NULL,
labels = scales::percent,
limits = c(0, 1.1),
breaks = seq(0,1, 0.25),
expand = expansion(0))+
scale_fill_manual(name = "Booking Status",
values = c("#f74e05", "#2505f7"))+
labs(title = "Over 30% of all booked guests don't show up")+
theme(legend.position = "top",
axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())+
facet_wrap(~Booking.Year, nrow = 3)
#2
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date)) %>%
filter(Booking.Year %in% c(2015, 2016, 2017)) %>%
mutate(Booking.Month = month(Booking.Date, label = TRUE),
Status = case_when(
Status == "Canceled" ~ "Cancelled and No-show",
Status == "No-Show" ~ "Cancelled and No-show",
TRUE ~ "Check-Out")) %>%
group_by(Status, Booking.Month, Booking.Year) %>%
summarise(Status.Count = n()) %>%
ungroup() %>%
group_by(Booking.Month, Booking.Year) %>%
mutate(Status.Count = round(Status.Count/sum(Status.Count), 3)) %>%
ggplot(aes(x = Booking.Month, y = Status.Count, fill = Status))+
geom_col(position = "Fill",
colour = "black",
linewidth = 1,
width = 0.6)+
geom_text(aes(label = scales::percent(Status.Count)),
vjust = 1.5,
size = 3.5, position = "fill",
colour = "white", fontface = "bold")+
scale_x_discrete(name = "Month")+
scale_y_continuous(name = NULL,
labels = scales::percent,
limits = c(0, 1.1),
breaks = seq(0,1, 0.25),
expand = expansion(0))+
scale_fill_manual(name = "Booking Status",
values = c("#f74e05", "#2505f7"))+
labs(title = "Over 30% of all booked guests don't show up")+
theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())+
facet_wrap(~Booking.Year, nrow = 3)
#Average daily spending per customer
SHG %>%
ggplot(aes(x = Distribution.Channel, y = Avg.Daily.Rate))+
geom_boxplot()
#Revenue Loss
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date)) %>%
pivot_longer(cols = c(Revenue, Revenue.Loss),
names_to = "Income",
values_to = "Income_Amount") %>%
group_by(Booking.Year, Income) %>%
summarise(Income_Amount = sum(Income_Amount)) %>%
ggplot(aes(x = Booking.Year, y = Income_Amount, fill = Income))+
geom_col(colour = "black",
linewidth = 1)+
geom_text(aes(label = scales::number(Income_Amount,
scale = 1e-6,
suffix = "M",
prefix = "$")),
position = "Stack",
vjust = 1.2,
colour = "white",
fontface = "bold",
size = 3.5)+
scale_x_continuous(name = "Booking Year",
limits = c(2013.5, 2017.5),
breaks = seq(2014, 2017, 1))+
scale_y_continuous(name = "Income and Loss",
labels = label_number(suffix = "M",
prefix = "$",
scale = 1e-6),
limits = c(-8e6, 14e6),
expand = expansion(0))+
scale_fill_manual(labels =c("Revenue", "Revenue Loss"),
values = c("#0576f7", "#f74e05"))+
labs(title = "High number of cancellations are leading to significant
revenue loss each year")
#Revenue generated each year
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date)) %>%
group_by(Booking.Year) %>%
summarise(Revenue = sum(Revenue)) %>%
ggplot(aes(x = Booking.Year, y = Revenue))+
geom_point(size = 3,
colour = "#0576f7")+
geom_line(linewidth = 1.2,
colour = "#0576f7",
alpha = 0.8)+
scale_y_continuous(name = "Revenue",
labels = label_number(suffix = "M",
prefix = "$",
scale = 1e-6),
limits = c(0e6, 15e6),
breaks = seq(0e6, 15e6, 5e6))+
scale_x_continuous(name = "Year")+
labs(title = "Total Revenue generated each year")
#Revenue generated each month
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = format_ISO8601(Booking.Date, precision = "ym"),
Booking.Year = ym(Booking.Year)) %>%
group_by(Booking.Year) %>%
summarise(Revenue = sum(Revenue)) %>%
ggplot(aes(x = Booking.Year, y = Revenue))+
geom_point(size = 3,
colour = "#0576f7")+
geom_line(linewidth = 1.2,
colour = "#0576f7",
alpha = 0.8)+
geom_vline(linetype = "dashed",
colour = "black",
xintercept = c(as.Date("2016-01-01"),
as.Date("2017-01-01")))+
scale_x_date(name = "Year",
date_breaks = "6 months",
date_labels = "%b%Y")+
scale_y_continuous(name = "Revenue",
labels = label_number(suffix = "M",
prefix = "$",
scale = 1e-6),
limits = c(0e6, 2.5e6),
breaks = seq(0e6, 2e6, 0.5e6))+
labs(title = "Revenue spikes recorded in January")
#Revenue and loss each year
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date)) %>%
pivot_longer(cols = c(Revenue, Revenue.Loss),
names_to = "Income",
values_to = "Income_Amount") %>%
group_by(Booking.Year, Income) %>%
summarise(Income_Amount = sum(Income_Amount)) %>%
ggplot(aes(x = Booking.Year, y = Income_Amount, colour = Income))+
geom_point(size = 3)+
geom_line(linewidth = 1.2,
alpha = 0.8)+
scale_colour_manual(labels =c("Revenue", "Revenue Loss"),
values = c("#0576f7", "#f74e05"))+
scale_y_continuous(name = "Revenue",
labels = label_number(suffix = "M",
prefix = "$",
scale = 1e-6),
limits = c(-8e6, 14e6),
breaks = seq(-10e6, 15e6, 5e6))+
scale_x_continuous(name = "Year")+
labs(title = "Higher revenue generation is mirrored by a larger
share of lost revenue from cancellations")
#Online vs Offline travel agents
SHG %>%
filter(Distribution.Channel %in% c("Online Travel Agent",
"Offline Travel Agent")) %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = format_ISO8601(Booking.Date, precision = "ym"),
Booking.Year = ym(Booking.Year)) %>%
group_by(Booking.Year, Distribution.Channel) %>%
summarise(Revenue = sum(Revenue)) %>%
ggplot(aes(x = Booking.Year, y = Revenue, colour = Distribution.Channel))+
geom_point(size = 3)+
geom_line(linewidth = 1.2,
alpha = 0.8)+
scale_x_date(date_breaks = "6 months",
date_labels = "%b%Y")+
scale_y_continuous(name = "Revenue",
labels = label_number(suffix = "M",
prefix = "$",
scale = 1e-6),
limits = c(0e6, 1.5e6),
breaks = seq(0e6, 1.5e6, 0.25e6))+
scale_colour_manual(values = c("#2505f7", "#05f721"))+
labs(title = "Online Travel agents generated")
#Calculation of Revenue contribution
SHG %>%
filter(Distribution.Channel %in% c("Online Travel Agent",
"Offline Travel Agent")) %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = year(Booking.Date)) %>%
filter(Booking.Year %in% c(2015, 2016, 2017)) %>%
group_by(Booking.Year, Distribution.Channel) %>%
summarise(Revenue = sum(Revenue)) %>%
ggplot(aes(x = Booking.Year, y = Revenue, fill = Distribution.Channel))+
geom_col(colour = "black",
position = position_dodge())+
geom_text(aes(label = scales::number(Revenue,
scale = 1e-6,
suffix = "M",
prefix = "$")),
position = position_dodge(width = 0.9),
vjust = 1.2,
colour = "white",
fontface = "bold",
size = 4)+
scale_x_continuous(name = "Year")+
scale_y_continuous(name = NULL,
labels = label_number(suffix = "M",
prefix = "$",
scale = 1e-6),
limits = c(0e6, 9e6),
breaks = seq(0e6, 9e6, 3e6),
expand = expansion(0))+
scale_fill_manual(name = "Distribution Channel",
values = c("#2505f7", "#05f721"))+
labs(title = "From 2015 to 2017, Online agents generated 2.5 times
more revenue than Offline agents")+
theme(axis.text.y = element_blank(),
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid.major = element_blank())
#Number of bookings per month
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = format_ISO8601(Booking.Date, precision = "ym"),
Booking.Year = ym(Booking.Year)) %>%
group_by(Booking.Year) %>%
summarise(Booking.ID = n()) %>%
ggplot(aes(x = Booking.Year, y = Booking.ID))+
geom_point(size = 3)+
geom_line(linewidth = 1.2,
alpha = 0.8)+
geom_vline(linetype = "dashed",
colour = "black",
xintercept = c(as.Date("2016-01-01"),
as.Date("2017-01-01")))+
scale_x_date(name = "Month and Year",
date_breaks = "6 months",
date_labels = "%b%Y")+
scale_y_continuous(name = "Bookings")+
labs(title = "Revenue spike corresponds with spike in bookings")
#Number of bookings per hotel per month
SHG %>%
mutate(Booking.Date = dmy(Booking.Date),
Booking.Year = format_ISO8601(Booking.Date, precision = "ym"),
Booking.Year = ym(Booking.Year)) %>%
group_by(Booking.Year, Hotel) %>%
summarise(Booking.ID = n()) %>%
ggplot(aes(x = Booking.Year, y = Booking.ID, colour = Hotel))+
geom_point(size = 3)+
geom_line(linewidth = 1.2,
alpha = 0.8)+
scale_x_date(date_breaks = "6 months",
date_labels = "%b%Y")