library(tidyverse)

hotels_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv')

From the source of the dataset, there are actually two different distributions at play here. The data collected of non-cancelled bookings is different than the data collected from cancelled bookings. Thus we will only consider the non-cancelled data, which has much more accurate customer information.

hotels = hotels_raw %>%
  filter(is_canceled == 0) %>%
  na.omit()

skimr::skim(hotels)

Data summary
Name	hotels
Number of rows	75166
Number of columns	32
_______________________
Column type frequency:
character	13
Date	1
numeric	18
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
hotel	1	10	12	2
arrival_date_month	1	3	9	12
meal	1	2	9	5
country	1	2	4	166
market_segment	1	6	13	7
distribution_channel	1	3	9	5
reserved_room_type	1	1	1	9
assigned_room_type	1	1	1	10
deposit_type	1	10	10	3
agent	1	1	4	315
company	1	1	4	332
customer_type	1	5	15	4
reservation_status	1	9	9	1

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
reservation_status_date	0	1	2015-07-01	2017-09-14	2016-09-01	805

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
is_canceled	1	0.00	0.00	0.00	0.0	0.0	0	0	▁▁▇▁▁
lead_time	1	79.98	91.11	0.00	9.0	45.0	124	737	▇▂▁▁▁
arrival_date_year	1	2016.15	0.70	2015.00	2016.0	2016.0	2017	2017	▃▁▇▁▆
arrival_date_week_number	1	27.08	13.90	1.00	16.0	28.0	38	53	▆▇▇▇▆
arrival_date_day_of_month	1	15.84	8.78	1.00	8.0	16.0	23	31	▇▇▇▇▆
stays_in_weekend_nights	1	0.93	0.99	0.00	0.0	1.0	2	19	▇▁▁▁▁
stays_in_week_nights	1	2.46	1.92	0.00	1.0	2.0	3	50	▇▁▁▁▁
adults	1	1.83	0.51	0.00	2.0	2.0	2	4	▁▂▇▁▁
children	1	0.10	0.39	0.00	0.0	0.0	0	3	▇▁▁▁▁
babies	1	0.01	0.11	0.00	0.0	0.0	0	10	▇▁▁▁▁
is_repeated_guest	1	0.04	0.20	0.00	0.0	0.0	0	1	▇▁▁▁▁
previous_cancellations	1	0.02	0.27	0.00	0.0	0.0	0	13	▇▁▁▁▁
previous_bookings_not_canceled	1	0.20	1.81	0.00	0.0	0.0	0	72	▇▁▁▁▁
booking_changes	1	0.29	0.74	0.00	0.0	0.0	0	21	▇▁▁▁▁
days_in_waiting_list	1	1.59	14.78	0.00	0.0	0.0	0	379	▇▁▁▁▁
adr	1	99.99	49.21	-6.38	67.5	92.5	125	510	▇▆▁▁▁
required_car_parking_spaces	1	0.10	0.30	0.00	0.0	0.0	0	8	▇▁▁▁▁
total_of_special_requests	1	0.71	0.83	0.00	0.0	1.0	1	5	▇▁▁▁▁

#EDA

hotels %>%
  count(hotel)

## # A tibble: 2 x 2
##   hotel            n
##   <chr>        <int>
## 1 City Hotel   46228
## 2 Resort Hotel 28938

We have around twice the amount of people staying in City as we do in Resort.

1 Number of bookings per month

Let’s look at the date range of our data

hotels %>%
  group_by(arrival_date_year)%>%
  count(arrival_date_week_number)%>%
  ggplot(aes(x=arrival_date_week_number,y=n,group=factor(arrival_date_year) ,col=factor(arrival_date_year) ))+
  geom_point()+
  geom_line(size=1)+
  theme_bw()+
  labs(title="Number of Bookings by Week of Year",x="Arrival Week",y="Number of Bookings",col="Year")+
  scale_color_manual(values=c("#C8E6C9", "#66BB6A", "#1B5E20"))

This plot shows the total number of books by week of year, usually there are around 500-800 books per week.

hotels %>%
#  select(arrival_date_year,arrival_date_month,arrival_date_day_of_month )  %>%
     mutate(arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year)),
           # date = lubridate::make_datetime(arrival_date) 
           ) %>%
  group_by(hotel)%>%
  count(arrival_date)%>%
  ggplot(aes(x=arrival_date,y=n,group=hotel,col=hotel))+
  #geom_point()+
  geom_line()+
  theme_minimal()+
  theme(legend.position = "top")+
  scale_x_date(date_labels = "%B\n%Y" )+
  #geom_hline(yintercept = 35,col="black",size=2,lty=2)+
  labs(title="Number of Hotel Bookings Between July 2015 to August 2017\nFor City and Resort Hotel",x="Date",y="Number of Bookings",col="Hotel")

Here we see the number of daily books by date and hotel;. We can see that the resort hotel has a relatively stable number of bookings. The average number of daily bookings for resort hotel is 35. The average for city is less consistent, but the average is about 60 bookings per day.

hotels %>%
#  select(arrival_date_year,arrival_date_month,arrival_date_day_of_month )  %>%
     mutate(arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year)),
           # date = lubridate::make_datetime(arrival_date) 
           ) %>%
  group_by(hotel)%>%
  count(arrival_date)%>%
  mutate( rolling_avg = zoo::rollmean(n,k=7,fill=NA,align="right") ) %>% 
  ggplot(aes(x=arrival_date,y=rolling_avg,group=hotel,col=hotel))+
  #geom_point()+
  geom_line()+
  theme_minimal()+
  theme(legend.position = "top")+
  scale_x_date(date_labels = "%B\n%Y" )+
  #geom_hline(yintercept = 35,col="black",size=2,lty=2)+
  labs(title="7 Day Rolling Average of Hotel Bookings Between July 2015 to August 2017 For City and Resort Hotel",x="Date",y="Number of Bookings",col="Hotel")

2 Room Average Daily Rate and Room Stay

What has been the Average Daily Rate?

hotels %>%
#  select(arrival_date_year,arrival_date_month,arrival_date_day_of_month )  %>%
     mutate(arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year)),
           # date = lubridate::make_datetime(arrival_date) 
           ) %>%
group_by(arrival_date)%>%
  summarise(Daily_ADR = mean(adr),.groups="drop")%>%
  mutate(rolling_adr =  zoo::rollmean(Daily_ADR,k=7,fill=NA)  ) %>%
  pivot_longer(cols= c(Daily_ADR,rolling_adr) ) %>% 
  ggplot(aes(x=arrival_date,y=value,group=name,col=name))+
  geom_line()+
  scale_color_manual(values=c("#99ffbb", "#008000"),labels=c("Daily Value","7 Day Rolling Average") )+
  scale_x_date(date_labels = "%B\n%Y" , date_breaks = "4 month" , date_minor_breaks = "2 month" )+
  scale_y_continuous(labels = scales::label_dollar())+
  theme_minimal()+
  theme(legend.position = "top")+
  labs(title="Average Daily Rate for all Hotels",x="Arrival Date",y="Average Daily Rate",col=NULL)

We can see that there is a clear spike in the average daily rate around the month of Augest, similarly, we see a small, yet significant spike at the tail end of December. This should not be surprising since the former spike corresponds to the summer vacation, while the later corresponds to the winter festive season. Also of note is that there appears to be a positive trend, we can see this directly from the peaks taking a larger value as time passes.

hotels %>%
#  select(arrival_date_year,arrival_date_month,arrival_date_day_of_month )  %>%
     mutate(arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year)),
           # date = lubridate::make_datetime(arrival_date) 
           ) %>%
group_by(arrival_date,hotel)%>%
  summarise(Daily_ADR = mean(adr),.groups="keep")%>%
  ungroup(arrival_date)%>%
  mutate(rolling_adr =  zoo::rollmean(Daily_ADR,k=7,fill=NA,align="right")  ) %>%
 # pivot_longer(cols= c(Daily_ADR,rolling_adr) ) %>% 
  ggplot()+
  geom_col(aes(x=arrival_date,y=Daily_ADR,fill=hotel),alpha=0.2 )+
  geom_line(aes(x=arrival_date,y=rolling_adr,col=hotel),size=1)+
  #scale_color_manual(values=c("grey80", "red"))+
  facet_wrap(.~hotel,nrow=2)+
  #annotate( geom="text",x=as.Date(1,origin ="2017-01-01"),y=100,label="7 Day Rolling Average",col="Blue",size=5)+
  geom_text(data = . %>% filter(hotel=="City Hotel"), aes(x=as.Date(1,origin ="2016-02-01"),y=150,label="7 Day Rolling Average"),col="#F8766D",size=4,fontface="bold")+
  scale_x_date(date_labels = "%b\n%Y" , date_breaks = "2 month" , date_minor_breaks = "1 month" )+
  scale_y_continuous(labels = scales::label_dollar())+
  theme_bw()+
  theme(legend.position = "none")+
  labs(title = "Average daily rate for City and Resort Hotels with 7 day moving average",x="Arrival Date",y="Average Daily Rate")

We can further decompose the ADR by hotel, for City Hotel we can observe that there is a slight positive trend and we can see peaks around January. For Resort hotel, we can clearly observe a seasonal pattern. There is a big peak in August and a smaller peak in January. Interestingly, the peaks for Resort Hotel in August appear to be increasing.

hotels %>%
#  select(arrival_date_year,arrival_date_month,arrival_date_day_of_month )  %>%
     mutate(arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year)),
           # date = lubridate::make_datetime(arrival_date) 
           ) %>%
group_by(arrival_date,hotel)%>%
  summarise(Daily_ADR = mean(adr),.groups="keep")%>%
  ungroup(arrival_date)%>%
  mutate(rolling_adr =  zoo::rollmean(Daily_ADR,k=7,fill=NA,align="right")  ) %>%
 # pivot_longer(cols= c(Daily_ADR,rolling_adr) ) %>% 
  ggplot(aes(x=arrival_date,y=rolling_adr,col=hotel,group=hotel))+
  geom_line(size=1)+
  #scale_color_manual(values=c("grey80", "red"))+
  #facet_wrap(.~hotel,nrow=2)+
  scale_x_date(date_labels = "%b\n%Y" , date_breaks = "2 month" , date_minor_breaks = "1 month" )+
  scale_y_continuous(labels = scales::label_dollar())+
  #geom_smooth(method="lm",lty=2)+
  theme_bw()+
  theme(legend.position = "top")+
  labs(title = "7 Day Moving Average Rates for City and Resort Hotels",x="Arrival Date",y="Average Daily Rate",col="Hotel")

We now overlay the rolling averages, and we see that during the months of July to September, the ADR for resort hotel is higher than city hotel, whereas during the rest of the year the ADR for city hotel is higher than resort hotel.

Let’s look at the weekday breakdown

hotels %>%
#  select(arrival_date_year,arrival_date_month,arrival_date_day_of_month )  %>%
     mutate(arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year)),
            day_of_week = lubridate::wday(arrival_date,label=TRUE,week_start=1) ,
            quarter = lubridate::quarter(arrival_date,fiscal_start = 3),
            season = factor(quarter, labels = c("Spring","Summer","Autumn","Winter"))
           ) %>%
  group_by(hotel,day_of_week)%>%
  ggplot(aes(x=day_of_week,y=adr ))+
  #geom_violin(aes(fill=day_of_week) , draw_quantiles = c(.25,.5,.75))+
  #geom_violin(aes(fill=day_of_week))+ 
  geom_boxplot()+
  facet_grid(hotel~season,scales="free_y")+
  labs(title="ADR by Day of Season and Hotel",x="Day of Week",y="ADR")+
  theme_bw()+
  theme(legend.position="none")

We can see that the that the rates during Summer for Resort Hotels are much higher.

hotels %>%
  mutate( arrival_date_month = factor(arrival_date_month, levels =  month.name)  ) %>% 
  group_by(hotel,arrival_date_month,arrival_date_year)%>%
  count() %>% 
  ungroup(arrival_date_year) %>%
  summarise(mean_bookings  = mean(n)  ,.groups="drop") %>% 
#pivot_wider(names_from=arrival_date_month,values_from = mean_bookings)
  ggplot(aes(x=arrival_date_month,y=mean_bookings,group=hotel,col=hotel))+
  geom_line(size=1.5)+
  geom_point(size=5)+
  theme_minimal()+
  theme(legend.position = "top")+
  labs(title="Average Number of Bookings by Arrival Month",x="Month",y="Number of Bookings",col="Hotel")

This above graph shows how, on average, the number of bookings per month is higher in city hotel than resort hotel

hotels %>%
  select(adults,children,babies,stays_in_week_nights,stays_in_weekend_nights,hotel,adr) %>%
  ggplot()+
  geom_histogram(aes(x=adr,fill=hotel),binwidth = 5,col="white")+
  facet_wrap(.~hotel,nrow=2,scales = "free")+
  theme_bw()+
  theme(legend.position = "none")+
  scale_x_continuous(labels = scales::label_dollar())+
  labs(title = "Distribution of ADR by Hotel",x="ADR",y="Count")

We see that the average adr for city hotel is higher than the ADR for resort

Let’s find out out long customers typically stayed for

hotels %>%
  mutate( total_stay = stays_in_week_nights + stays_in_week_nights) %>%
  ggplot()+
  geom_histogram(aes(x=total_stay,fill=hotel),binwidth=2,col="white")+
  facet_wrap(.~hotel,nrow=2,scales="free")+
  theme_bw()+
  theme(legend.position = "none")+
  labs(title = "Histogram of Total Night Stayed at Hotel",x="Nights Stayed",y="Count")

The distribtuion of number of nights stayed are similar for both hotels.

hotels %>%
  mutate( total_stay = stays_in_week_nights + stays_in_week_nights, 
        arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year) )
        ) %>% 
  group_by(hotel,arrival_date)%>%
  summarise(Avg_stay = mean(total_stay),.groups="drop" ) %>%
  ggplot()+
  geom_line(aes(x=arrival_date,y=Avg_stay,col=hotel))+
  facet_wrap(.~hotel,nrow=2)+
  scale_x_date(date_labels = "%b\n%Y" , date_breaks = "2 month" , date_minor_breaks = "1 month" )+
  theme_bw()+
  theme(legend.position = "none")+
  labs(title="Average nights stayed by date and hotel",x="Arrival Date",y="Nights Stayed")

We clearly see that the number of nights stayed at the City Hotel is about 5 days, while the number of nights stayed at the resort hotel depends on the time of year. For example, during the months of January to March, there is a typical dip in the stay length, of around three to four months, but during the summer months, most notably, July to September, there is an increase to around eight to nine nights stayed.

hotels %>%
  mutate( total_stay = stays_in_week_nights + stays_in_week_nights, 
        arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year) )
        ) %>% 
  group_by(hotel,arrival_date)%>%
  summarise(Avg_stay = mean(total_stay),.groups="drop" ) %>%
  group_by(hotel) %>% 
  mutate(rolling_avg = zoo::rollmean(Avg_stay,k=7,fill=NA,align="right") ) %>% 
  ggplot()+
  geom_line(aes(x=arrival_date,y=rolling_avg,col=hotel,group=hotel))+
 # facet_wrap(.~hotel,nrow=2)+
  scale_x_date(date_labels = "%b\n%Y" , date_breaks = "2 month" , date_minor_breaks = "1 month" )+
  theme_bw()+
  theme(legend.position = "top")+
  labs(title="Average nights stayed by date and hotel",x="Arrival Date",y="Nights Stayed",col="Hotel")

We overlay to two graphs over each other. We see that outside the month of November to March, generally speaking, people spend more nights in the resort hotel than city hotel. In the months mentioned, the two hotels have similar lengths of stay.

hotels %>%
  mutate( total_stay = stays_in_week_nights + stays_in_week_nights, 
        arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year) ),
        days_stayed =  reservation_status_date - arrival_date 
        ) %>% 
  slice_sample(n=5)# %>%

## # A tibble: 5 x 35
##   hotel is_canceled lead_time arrival_date_ye~ arrival_date_mo~ arrival_date_we~
##   <chr>       <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Reso~           0       107             2016 May                            19
## 2 Reso~           0         5             2016 March                          10
## 3 City~           0         0             2017 March                          10
## 4 City~           0        10             2017 January                         2
## 5 City~           0       320             2016 August                         34
## # ... with 29 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## #   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## #   total_of_special_requests <dbl>, reservation_status <chr>,
## #   reservation_status_date <date>, total_stay <dbl>, arrival_date <date>,
## #   days_stayed <drtn>

  #arrange(days_stayed)

arr  <- hotels %>%  
  mutate( arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year))
          ) %>%
  group_by(arrival_date,hotel)%>%
  summarise(Arriving_Adults = sum(adults), .groups="drop" )


dep <- hotels %>%  
  group_by(reservation_status_date,hotel)%>%
  summarise(Departing_Adults = sum(adults), .groups="drop" )


dep_and_arr <-  arr %>% 
  full_join(dep,by=c("hotel","arrival_date"="reservation_status_date")) %>% 
filter( arrival_date <= "2017-08-31" ) # Study stopped collecting arrivals after this date

dep_and_arr  <- dep_and_arr  %>%
  mutate( Arriving_Adults = replace_na(Arriving_Adults,0),
          Departing_Adults= replace_na(Departing_Adults,0))

dep_and_arr %>% 
  group_by(hotel) %>% 
    mutate(change = Arriving_Adults - Departing_Adults,
         cumu_change = as.numeric(cumsum(change) )
    ) %>%
  ggplot()+
  geom_line(aes(x=arrival_date,y=cumu_change,col=hotel))+
  facet_wrap(.~hotel,nrow=2,scales = "free")+
  scale_x_date(date_labels = "%b\n%Y" , date_breaks = "2 month" , date_minor_breaks = "1 month" )+
  theme_bw()+
  theme(legend.position = "none")+
  labs(title="Number of adults staying at hotel from start of study",x="Date",y="Change in number of adults",subtitle = "Defined up to the initial number of customers at hotel")

This graph depicts the the change in the number of adults staying at the hotel. This is defined up to some constant since we don’t know the initial or the final number of adults staying at the hotel. We use the formula \(\text{Inital State} + \Delta\text{State} = \text{Final State}\)

Next question to ask How much revenue was generated by time.

hotels %>%
    mutate(total_stay = stays_in_week_nights + stays_in_week_nights) %>% 
  group_by(hotel,reservation_status_date)%>%
  summarise(total =  sum(adr*total_stay ), .groups="drop") %>%
  ungroup()%>%
  ggplot()+
   geom_line(aes(x=reservation_status_date,y=total,col=hotel))+ 
  facet_wrap(.~hotel,nrow=2,scales = "free")+
  scale_x_date(date_labels = "%b\n%Y" , date_breaks = "2 month" , date_minor_breaks = "1 month" )+
  scale_y_continuous(labels=scales::dollar_format())+
  theme_bw()+
  theme(legend.position = "none")+
  labs(title="Revenue from entire duration of stay by hotel",x="Check-out date",y="Revenue")

By multiplying the adr by the number of nights stayed, we can estimate how much a customer spent in total by check-out. We see some variability over the cause of the years, but we observe some noticeable spikes, such as during the final days of December. Interestingly enough there appears to be reduce variability about the late December spike, this could be due to the pent-up demand that was released.

Another odd thing to notice is a huge spike in June 2016 for City Hotel, there was an unprecedented spike in the amount of revenue gained. This represents a near 5x increase over the mean revenue during the entire study fore City Hotel.

hotels %>%
  filter(hotel=="City Hotel",
         reservation_status_date <= "2016-08-21" & reservation_status_date >= "2016-03-07") %>% 
    mutate(total_stay = stays_in_week_nights + stays_in_week_nights) %>% 
  group_by(reservation_status_date)%>%
  summarise(total =  sum(adr*total_stay ), .groups="drop")%>%
  ungroup() %>%
  ggplot()+
  geom_line(aes(x=reservation_status_date,y=total),col="#F8766D",size=1.5)+
  scale_x_date(date_labels = "%d %b\n%Y" , date_breaks = "2 weeks" , date_minor_breaks = "1 week" )+
  scale_y_continuous(labels=scales::dollar_format())+
   theme_bw()+
  labs(title="The huge revenue spike for City Hotel on 29th May 2016",x="Check-out Date",y="Revenue")

Let’s try to find why this has happened

hotels %>%
  filter(hotel=="City Hotel",
         reservation_status_date <= "2016-06-7" & reservation_status_date >= "2016-05-20"
  ) %>% 
  group_by(reservation_status_date) %>% 
  count(reservation_status_date) %>%
  ggplot()+
  geom_line(aes(x=reservation_status_date,y=n),col="#F8766D",size=2)+
  scale_x_date(date_labels = "%d %b\n%Y" , date_breaks = "2 days" , date_minor_breaks = "1 day" )+
  theme_bw()+
  labs(title="Number of Check-outs in May 2016 from City hotel",x="Check-out Date", y="Number of recorded check-outs")

The only solution I can come up with is there may have been an issue with internal data collection, or it the fact it was memeorial day, which happened to have occurred on 30th May for 2016. I find the latter cause to be unlikely, due to the fact that we should have observed similar spikes on memorial day for the other years, yet we do not see such spikes.

We can view the average ADR by day of week too.

hotels %>%
  mutate(day_of_week = lubridate::wday(reservation_status_date,label=TRUE,week_start=1)   ) %>%
  group_by(hotel,day_of_week) %>% 
  summarise(mean_adr = mean(adr),.groups="drop")%>%
  pivot_wider(names_from = "day_of_week", values_from="mean_adr")%>%
  knitr::kable(caption="Average ADR by hotel and day of week")

Average ADR by hotel and day of week
hotel	Mon	Tue	Wed	Thu	Fri	Sat	Sun
City Hotel	106.39412	105.02620	102.58313	105.88160	104.92217	107.5260	107.48796
Resort Hotel	94.37347	88.97903	84.09174	82.68951	90.48672	100.7189	93.01558

3 Country of Origin

Let’s switch gears to look at the country of origin of hotel guests

I will make an assumption that the country “CN” is “CAN”

countries <- hotels$country %>% 
  unique

countries <- countries %>% str_replace(.,"CN","CAN")

country_table = bind_cols(full_name = countrycode::countrycode(countries, origin = 'iso3c', destination = 'country.name') ,short_name= countries)

## Warning in countrycode::countrycode(countries, origin = "iso3c", destination = "country.name"): Some values were not matched unambiguously: NULL, TMP

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
         country = fct_lump_n(country,n=20)
         )%>%
  count(country) %>%
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other")  ) %>% 
  ggplot(aes(x=n,y=reorder(full_name,n) ))+
  geom_col(fill="#087719")+
  geom_text(aes(label=  format(n, big.mark = ",", scientific = FALSE)) ,hjust=0.01)+
  theme_classic()+
  theme(panel.grid.major.x = element_line(),
        panel.grid.minor.x = element_line(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank()
        )+
  scale_x_continuous(labels=scales::comma_format() , expand = c(0,0) ,limits = c(0,24000) )+
  labs(title="Top 20 Most Common Countries of Origin",x="Number of Visitors",y="Home Country")

Here we see that the most common visitor countries of origin are: Portugal, followed by the United Kingdom, and then France, Spain, and Germany follow. This shows that it is very likely that the hotels are located in Portugal since it is the most common country of origin by a large margin. We also have many European visitors suggesting it is near the European mainland.

hotels %>%
  group_by(hotel) %>% 
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
         country = fct_lump_n(country,n=10)
         )%>%
  count(country) %>%
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other")  ) %>% 
  ggplot(aes(x=n, y = tidytext::reorder_within(full_name,n,hotel) ))+
  geom_col(aes(fill=hotel))+
  geom_text(aes(label=  format(n, big.mark = ",", scientific = FALSE)) ,hjust=0.01)+
  tidytext::scale_y_reordered()+
  facet_wrap(.~hotel, scales = "free")+
  theme_minimal()+
  theme(panel.grid.major.x = element_line(),
        panel.grid.minor.x = element_line(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        legend.position = "none",
        strip.background = element_rect(colour = "black", fill = "grey90")
        )+
  scale_x_continuous(labels=scales::comma_format() , expand = c(0,0) ,limits = c(0,14000) )+
  labs(title="Top 20 most common countries of origin by hotel",x="Number of Visitors",y="Home Country")

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ,
         hotel == "City Hotel") %>% 
  mutate(country =str_replace(country,"CN","CAN"),
    #    arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year))
     arrival_date_month = factor(arrival_date_month, levels = month.name) 
         )%>%
  group_by(arrival_date_month ) %>% 
  count(country,sort = TRUE) %>%
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other")  ) %>%
  slice_max(n,n=5) %>%
  ggplot(aes(x=n,y= tidytext::reorder_within(full_name,n,arrival_date_month) ))+
  geom_col(aes(fill=arrival_date_month))+
  geom_text(aes(label=n),hjust=-.1,size=3)+
  tidytext::scale_y_reordered()+
  facet_wrap(.~arrival_date_month,scales = "free_y")+
  theme_minimal()+
    theme(panel.grid.major.y = element_blank(),
      #    panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
        axis.ticks.y = element_blank(),
        #axis.text.x = element_blank(),
        axis.line.y = element_blank(),
        legend.position = "none",
        strip.background = element_rect(colour = "black", fill = "grey90")
        )+
    scale_x_continuous(labels=scales::comma_format() , expand = c(0,0) ,limits = c(0,1400) )+
  labs(title="Top 5 most common booking country of origin by month for City Hotel",x="Number of bookings", y="")

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ,
         hotel == "Resort Hotel") %>% 
  mutate(country =str_replace(country,"CN","CAN"),
    #    arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year))
     arrival_date_month = factor(arrival_date_month, levels = month.name) 
         )%>%
  group_by(arrival_date_month ) %>% 
  count(country,sort = TRUE) %>%
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other")  ) %>%
  slice_max(n,n=5) %>%
  ggplot(aes(x=n,y= tidytext::reorder_within(full_name,n,arrival_date_month) ))+
  geom_col(aes(fill=arrival_date_month))+
  geom_text(aes(label=n),hjust=-.1,size=3)+
  tidytext::scale_y_reordered()+
  facet_wrap(.~arrival_date_month,scales = "free_y")+
  theme_minimal()+
    theme(panel.grid.major.y = element_blank(),
      #    panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
        axis.ticks.y = element_blank(),
        #axis.text.x = element_blank(),
        axis.line.y = element_blank(),
        legend.position = "none",
        strip.background = element_rect(colour = "black", fill = "grey90")
        )+
    scale_x_continuous(labels=scales::comma_format() , expand = c(0,0) ,limits = c(0,1600) )+
  labs(title="Top 5 most common booking country of origin by month for Resort Hotel",x="Number of bookings", y="")

all_hotels_top = hotels %>%
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
   #      country = fct_lump_n(country,n=20)
         )%>%
  count(country,sort=TRUE) %>%
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other")  ) %>%
  slice_head(n=5)%>%
  pluck("full_name")

#all_hotels_top

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
    #    arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year))
     arrival_date_month = factor(arrival_date_month, levels = month.name)
         )%>%
  group_by(arrival_date_month,arrival_date_year ) %>% 
  count(country,sort = TRUE) %>%
  ungroup() %>%
  group_by(country,arrival_date_month) %>% 
    summarise(mean_num = mean(n), .groups = "drop"  ) %>% 
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other"),
         mean_num = round(mean_num)) %>%
  mutate(full_name = fct_reorder(full_name, mean_num, tail, n = 1, .desc = TRUE)) %>%
  mutate(  arrival_date_month   = factor(month.abb[arrival_date_month],levels=month.abb) ) %>% 
  ggplot()+
  geom_line( data= .%>%  filter(full_name %in% all_hotels_top  ) ,
             aes(x=arrival_date_month,y=mean_num,group=full_name,col=full_name),
             size=1.2)+
  geom_line( data= .%>%  filter(!full_name %in% all_hotels_top  ) ,
             aes(x=arrival_date_month,y=mean_num,group=full_name),
             alpha=0.2)+
  theme_minimal()+
  labs(title="Mean number of origin of country bookings for all hotels",x=NULL,y="Number of bookings",col="Country")

People from Portugal were the most common group. Then we see the other countries following behind, where the second most common country of origin depends on the month. For example, the months from April to July and September to October, British people were second most common. Spanish people were the most common in August and December. French were the most common in November.

# Old Version of City of Hotel

# hotels %>%
#   filter(!country  %in% c("NULL","TEMP"),
#          hotel=="City Hotel") %>% 
#   mutate(country =str_replace(country,"CN","CAN"),
#     #    arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year))
#      arrival_date_month = factor(arrival_date_month, levels = month.name)
#          )%>%
#   group_by(arrival_date_month,arrival_date_year ) %>% 
#   count(country,sort = TRUE) %>%
#   ungroup() %>%
#   group_by(country,arrival_date_month) %>% 
#     summarise(mean_num = mean(n), .groups = "drop"  ) %>% 
#   left_join(country_table,by=c("country"="short_name"))%>%
#   mutate(full_name = replace_na(full_name,"Other"),
#          mean_num = round(mean_num)) %>%
#   mutate(full_name = fct_reorder(full_name, mean_num, tail, n = 1, .desc = TRUE)) %>%
#   mutate(  arrival_date_month   = factor(month.abb[arrival_date_month],levels=month.abb) ) %>% 
#   ggplot()+
#   geom_line( data= .%>%  filter(full_name %in% all_hotels_top  ) ,
#              aes(x=arrival_date_month,y=mean_num,group=full_name,col=full_name),
#              size=1.2)+
#   geom_line( data= .%>%  filter(!full_name %in% all_hotels_top  ) ,
#              aes(x=arrival_date_month,y=mean_num,group=full_name),
#              alpha=0.2)+
#   theme_minimal()+
#   labs(title="Mean number of origin of country bookings for city hotel",x=NULL,y="Number of bookings",col="Country")

 ## Old version of Resort Hotel

# hotels %>%
#   filter(!country  %in% c("NULL","TEMP"),
#          hotel == "Resort Hotel") %>%
#   mutate(country =str_replace(country,"CN","CAN"),
#     #    arrival_date =  lubridate::dmy( paste(arrival_date_day_of_month,arrival_date_month,arrival_date_year))
#      arrival_date_month = factor(arrival_date_month, levels = month.name)
#          )%>%
#   group_by(arrival_date_month,arrival_date_year ) %>%
#   count(country,sort = TRUE) %>%
#   ungroup() %>%
#   group_by(country,arrival_date_month) %>%
#     summarise(mean_num = mean(n), .groups = "drop"  ) %>%
#   left_join(country_table,by=c("country"="short_name"))%>%
#   mutate(full_name = replace_na(full_name,"Other"),
#          mean_num = round(mean_num)) %>%
#   mutate(full_name = fct_reorder(full_name, mean_num, tail, n = 1, .desc = TRUE)) %>%
#   mutate(  arrival_date_month   = factor(month.abb[arrival_date_month],levels=month.abb) ) %>%
#   ggplot()+
#   geom_line( data= .%>%  filter(full_name %in% all_hotels_top  ) ,
#              aes(x=arrival_date_month,y=mean_num,group=full_name,col=full_name),
#              size=1.2)+
#   geom_line( data= .%>%  filter(!full_name %in% all_hotels_top  ) ,
#              aes(x=arrival_date_month,y=mean_num,group=full_name),
#              alpha=0.2)+
#   theme_minimal()+
#   labs(title="Mean number of origin of country bookings for all hotels",x=NULL,y="Number of bookings",col="Country")

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
     arrival_date_month = factor(arrival_date_month, levels = month.name)
         )%>%
  group_by(arrival_date_month,arrival_date_year,hotel ) %>% 
  count(country,sort = TRUE) %>%
  ungroup() %>%
  group_by(country,arrival_date_month,hotel) %>% 
    summarise(mean_num = mean(n), .groups = "drop"  ) %>% 
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other"),
         mean_num = round(mean_num)) %>%
  mutate(full_name = fct_reorder(full_name, mean_num, tail, n = 1, .desc = TRUE)) %>%
  mutate(  arrival_date_month   = factor(month.abb[arrival_date_month],levels=month.abb) ) %>% 
  ggplot()+
    geom_line( data= .%>%  filter(!full_name %in% all_hotels_top  ) ,
             aes(x=arrival_date_month,y=mean_num,group=full_name),
             alpha=0.2)+
  geom_line( data= .%>%  filter(full_name %in% all_hotels_top  ) ,
             aes(x=arrival_date_month,y=mean_num,group=full_name,col=full_name),
             size=1.2)+
  facet_wrap(.~hotel)+
  theme_bw()+
  theme(legend.position = "top",
        panel.grid.major.x = element_blank())+
  labs(title="Average monthly bookings by country of origin",x=NULL,y="Number of bookings",col="Country")

We notice several interesting behaviours:

Portugal regularly the top the list of most common country, but experiences some monthly changes.
French and Germans tend to prefer the city hotel rather than the resort hotel.
People from the United Kingdom tend to prefer resorts over city of hotel, to the point where they made up the most common country for several months, surpassing the Portuguese.

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
             country = fct_lump_n(country,n=30),
     arrival_date_month = factor(arrival_date_month, levels = month.name)
         )%>%
  group_by(hotel ) %>% 
  count(country,sort = TRUE) %>%
  ungroup()%>%
  group_by(country) %>% 
  mutate(prop = n/sum(n))%>%
  filter(hotel == "City Hotel") %>%
  #mutate( prop = prop-0.5) %>%
  left_join(country_table,by=c("country"="short_name"))%>%
  mutate(full_name = replace_na(full_name,"Other*") ) %>% 
  filter(full_name !="Other*") %>% 
  mutate(full_name =  glue::glue("{full_name} ({n})") )%>%
  ggplot(aes(x=prop,y=reorder(full_name,prop) ))+
#  geom_col()+
  geom_col(aes(fill= prop > 0.5))+
  scale_x_continuous(labels = scales::percent_format(), breaks = round(seq(0,1,0.1),1), minor_breaks = NULL, expand=c(0,0),limits = c(0,1.05) )+
#  scale_x_continuous(trans =  scales::trans_new("shift",
#                             transform = function(x) {x-0.5},
#                             inverse = function(x) {x+0.5}),
#                     labels = scales::percent_format(), breaks = round(seq(0.1,1,0.1),1), minor_breaks = NULL, limits = c(0.29,1) )+
#  geom_vline(xintercept = 0.5,lty=5,col="grey50",size=1)+
  geom_vline(xintercept = 0.608,lty=5,col="grey50",size=1,alpha=0.5)+
 # geom_text(aes(label= scales::percent(prop,accuracy=1),hjust=  if_else(prop>0.5,-.1,1) ),size=3)+
  geom_text(aes(label= scales::percent(prop,accuracy=0.1),hjust=-.1),size=3)+
  annotate("text",x=0.7, y=3,label="Average Country\nPercent\n(60.8%)",col="grey50",size=3)+
  theme_minimal()+
  theme(legend.position = "bottom") +
  labs(title="Percent of people from countries that went to city hotel", subtitle = "Country (Number of bookings at city)",
       x="Percent destination to city ", y="Country",fill="Majority Destination")+
  scale_fill_manual(values = c("#F8766D","#00BFC4"), labels = c("Resort","City"), guide = guide_legend(reverse = TRUE))

We see that Israelis preferred going to the city hotel over the resort hotel; 484 chose to do so. The Canadians, British, and Irish all preferred going to the resort than the city.

hotels %>%
  filter(!country  %in% c("NULL","TEMP") ) %>% 
  mutate(country =str_replace(country,"CN","CAN"),
             country = fct_lump_n(country,n=20),
     arrival_date_month = factor(arrival_date_month, levels = month.name)
         )%>%
  filter(country != "Other") %>% 
  group_by(hotel ) %>% 
  count(country,sort = TRUE) %>%
  ungroup()%>%
  group_by(country) %>% 
  mutate(prop = n/sum(n))%>%
  ungroup() %>% 
  group_by(hotel) %>%
  summarise(t_count = sum(n)) %>%
  ungroup() %>% 
  mutate( prop =t_count/sum(t_count)    )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 3
##   hotel        t_count  prop
##   <chr>          <int> <dbl>
## 1 City Hotel     42575 0.609
## 2 Resort Hotel   27381 0.391

The average proportion that went to city or resort hotel, when the least common countries were filtered out.

4 Meals

hotels %>% 
  mutate(is_repeated_guest = factor(is_repeated_guest),
         meal = case_when(   meal == "BB" ~ "BB",
                                meal == "HB" ~ "HB",
                                meal == "FB" ~ "FB",
                                TRUE ~ "SC")  
         )%>% 
  group_by(hotel) %>% 
  count(meal) %>%
#  ungroup()%>%
  ggplot(aes(x=meal,y=n,fill=hotel))+
  geom_col( position = position_dodge() )+
  geom_text(aes(label=n),position = position_dodge(width=.9),vjust=-.5)+
  #facet_wrap(.~is_repeated_guest, nrow=2) +
  theme_minimal()+
  theme(panel.grid.major.x = element_blank() )+
  labs(title="Number of Meals Booked by Hotel",x="Meal",y="Number of Meals",fill="Hotel")

We see the most common mean orderd at both hotels were the bed board, i.ie a bed and breakfast.

Do meals types change during the year?

hotels %>%
    mutate( 
      arrival_date_month = factor(arrival_date_month, levels =  month.name) ,
      meal = case_when(   meal == "BB" ~ "BB",
                                meal == "HB" ~ "HB",
                                meal == "FB" ~ "FB",
                                TRUE ~ "SC")  
         ) %>% 
  
  count(hotel,arrival_date_month,meal) %>%
  ggplot(aes(x=arrival_date_month,y=n,fill=meal,group=meal)  ) +
  geom_col(position = "fill",alpha=0.8)+
  facet_wrap(.~hotel,nrow=2)+
  theme_bw()+
  theme(legend.position = "top"  )+
  labs(title = "Proportions of Meals Ordered by Month and Hotel",x="Month",y="Proportion",fill="Meal")

We see some changes in the meals ordered by month.

Finally we see a table of the number of meals ordered by hotel and month.

hotels %>%
    mutate( 
      arrival_date_month = factor(arrival_date_month, levels =  month.name) ,
      meal = case_when(   meal == "BB" ~ "BB",
                                meal == "HB" ~ "HB",
                                meal == "FB" ~ "FB",
                                TRUE ~ "SC")  
         ) %>% 
  
  count(hotel,arrival_date_month,meal) %>%
  group_by(hotel) %>%
  pivot_wider(names_from = arrival_date_month,values_from=n)%>%
  ungroup() %>% 
  select(-hotel) %>% 
  rename(" " = meal) %>% 
  kableExtra::kbl(caption = "Number of meals ordered during period by month and hotel")%>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  kableExtra::group_rows(index = c("City Hotel" = 4, "Resort Hotel" = 4)) %>% 
  kableExtra::add_header_above(c("Meal Type", "Month" = 12))

Number of meals ordered during period by month and hotel
Meal Type	Month
	January	February	March	April	May	June	July	August	September	October	November	December
City Hotel
BB	1771	2429	3248	3032	3591	3215	3339	3940	3264	3567	2304	1938
FB	1	1	3	1	NA	1	NA	NA	1	NA	NA	1
HB	130	117	160	323	273	423	721	654	606	402	34	137
SC	352	517	661	659	715	727	722	787	419	368	358	316
Resort Hotel
BB	1562	1840	2061	1796	2012	1595	2304	2234	1552	2166	1610	1430
FB	6	45	10	14	10	13	71	71	13	15	20	23
HB	182	295	358	578	503	409	722	915	528	386	292	331
SC	118	128	144	162	10	21	40	37	9	10	54	233

Tidy Tuesday Rewind: Hotels

11/11/2020

1 Number of bookings per month

2 Room Average Daily Rate and Room Stay

3 Country of Origin

4 Meals