Dataset

Hotel Booking Demand

Research question

Load libraries

library(tidyverse)
library(Hmisc)
library(psych)
library(skimr)
library(GGally)

Import data

df=read.csv("hotel_bookings.csv",header=TRUE)
dim(df)
## [1] 119390     32
str(df)
## 'data.frame':    119390 obs. of  32 variables:
##  $ hotel                         : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
##  $ is_canceled                   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : int  342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : Factor w/ 12 levels "April","August",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ arrival_date_week_number      : int  27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : int  0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : int  2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : Factor w/ 5 levels "BB","FB","HB",..: 1 1 1 1 1 1 1 2 1 3 ...
##  $ country                       : Factor w/ 178 levels "ABW","AGO","AIA",..: 137 137 60 60 60 60 137 137 137 137 ...
##  $ market_segment                : Factor w/ 8 levels "Aviation","Complementary",..: 4 4 4 3 7 7 4 4 7 6 ...
##  $ distribution_channel          : Factor w/ 5 levels "Corporate","Direct",..: 2 2 2 1 4 4 2 2 4 4 ...
##  $ is_repeated_guest             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : Factor w/ 10 levels "A","B","C","D",..: 3 3 1 1 1 1 3 3 1 4 ...
##  $ assigned_room_type            : Factor w/ 12 levels "A","B","C","D",..: 3 3 3 1 1 1 3 3 1 4 ...
##  $ booking_changes               : int  3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ agent                         : Factor w/ 334 levels "1","10","103",..: 334 334 334 157 103 103 334 156 103 40 ...
##  $ company                       : Factor w/ 353 levels "10","100","101",..: 353 353 353 353 353 353 353 353 353 353 ...
##  $ days_in_waiting_list          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ adr                           : num  0 0 75 75 98 ...
##  $ required_car_parking_spaces   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : Factor w/ 3 levels "Canceled","Check-Out",..: 2 2 2 2 2 2 2 2 1 1 ...
##  $ reservation_status_date       : Factor w/ 926 levels "2014-10-17","2014-11-18",..: 122 122 123 123 124 124 124 124 73 62 ...

Data preparation

Missing value analysis

mva = apply(ifelse(df == 'NULL', 1, 0), 2, sum)
df = df %>% mutate_at(vars(children), ~replace(.,is.na(.),0))
summary(df$children)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1039  0.0000 10.0000

Change variable type

df = df %>% mutate_at (vars(is_canceled,arrival_date_year,arrival_date_week_number,is_repeated_guest), list(factor))
df$reservation_status_date = as.Date(df$reservation_status_date)

Create new variables

df1=df 
df1$nights= df1$stays_in_weekend_nights + df1$stays_in_week_nights
df1$people= df1$adults + df1$children + df1$babies
df1$children_babies= df1$children + df1$babies
df1$is_family= ifelse(df1$children_babies>=1,"1","0")
df1$is_waitlisted= ifelse(df1$days_in_waiting_list>=1,"1","0")
df1$reserved_room_type = as.character(df1$reserved_room_type)
df1$assigned_room_type = as.character(df1$assigned_room_type)
df1$room_type_change= ifelse(df1$reserved_room_type == df1$assigned_room_type,"0","1")
df1$company = as.character(df1$company)
df1$agent = as.character(df1$agent)
df1$from_company = ifelse(grepl("NULL", df1$company),0,1)
df1$from_agent = ifelse(grepl("NULL", df1$agent),0,1)
df1$has_pcancel = ifelse(df1$previous_cancellations>=1,"1","0")
df1$has_pbnc = ifelse(df1$previous_bookings_not_canceled>=1,"1","0")
df1$has_bchanges = ifelse(df1$booking_changes>=1,"1","0")
df1$has_srequests = ifelse(df1$total_of_special_requests>=1,"1","0")
df1$req_parking = ifelse(df1$required_car_parking_spaces>=1,"1","0")

Summary

df1 = df1 %>% mutate_at (vars(reserved_room_type,assigned_room_type, room_type_change, is_family, is_waitlisted, agent,company, has_pcancel, has_pbnc, has_bchanges, has_srequests, from_company, from_agent, req_parking), list(factor))
skim(df1)
Data summary
Name df1
Number of rows 119390
Number of columns 45
_______________________
Column type frequency:
Date 1
factor 27
numeric 17
________________________
Group variables

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
hotel 0 1 FALSE 2 Cit: 79330, Res: 40060
is_canceled 0 1 FALSE 2 0: 75166, 1: 44224
arrival_date_year 0 1 FALSE 3 201: 56707, 201: 40687, 201: 21996
arrival_date_month 0 1 FALSE 12 Aug: 13877, Jul: 12661, May: 11791, Oct: 11160
arrival_date_week_number 0 1 FALSE 53 33: 3580, 30: 3087, 32: 3045, 34: 3040
meal 0 1 FALSE 5 BB: 92310, HB: 14463, SC: 10650, Und: 1169
country 0 1 FALSE 178 PRT: 48590, GBR: 12129, FRA: 10415, ESP: 8568
market_segment 0 1 FALSE 8 Onl: 56477, Off: 24219, Gro: 19811, Dir: 12606
distribution_channel 0 1 FALSE 5 TA/: 97870, Dir: 14645, Cor: 6677, GDS: 193
is_repeated_guest 0 1 FALSE 2 0: 115580, 1: 3810
reserved_room_type 0 1 FALSE 10 A: 85994, D: 19201, E: 6535, F: 2897
assigned_room_type 0 1 FALSE 12 A: 74053, D: 25322, E: 7806, F: 3751
deposit_type 0 1 FALSE 3 No : 104641, Non: 14587, Ref: 162
agent 0 1 FALSE 334 9: 31961, NUL: 16340, 240: 13922, 1: 7191
company 0 1 FALSE 353 NUL: 112593, 40: 927, 223: 784, 67: 267
customer_type 0 1 FALSE 4 Tra: 89613, Tra: 25124, Con: 4076, Gro: 577
reservation_status 0 1 FALSE 3 Che: 75166, Can: 43017, No-: 1207
is_family 0 1 FALSE 2 0: 110058, 1: 9332
is_waitlisted 0 1 FALSE 2 0: 115692, 1: 3698
room_type_change 0 1 FALSE 2 0: 104473, 1: 14917
from_company 0 1 FALSE 2 0: 112593, 1: 6797
from_agent 0 1 FALSE 2 1: 103050, 0: 16340
has_pcancel 0 1 FALSE 2 0: 112906, 1: 6484
has_pbnc 0 1 FALSE 2 0: 115770, 1: 3620
has_bchanges 0 1 FALSE 2 0: 101314, 1: 18076
has_srequests 0 1 FALSE 2 0: 70318, 1: 49072
req_parking 0 1 FALSE 2 0: 111974, 1: 7416

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737 ▇▂▁▁▁
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31 ▇▇▇▇▆
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19 ▇▁▁▁▁
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50 ▇▁▁▁▁
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55 ▇▁▁▁▁
children 0 1 0.10 0.40 0.00 0.00 0.00 0 10 ▇▁▁▁▁
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10 ▇▁▁▁▁
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72 ▇▁▁▁▁
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21 ▇▁▁▁▁
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391 ▇▁▁▁▁
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400 ▇▁▁▁▁
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8 ▇▁▁▁▁
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5 ▇▁▁▁▁
nights 0 1 3.43 2.56 0.00 2.00 3.00 4 69 ▇▁▁▁▁
people 0 1 1.97 0.72 0.00 2.00 2.00 2 55 ▇▁▁▁▁
children_babies 0 1 0.11 0.41 0.00 0.00 0.00 0 10 ▇▁▁▁▁

Vizualisations

is canceled and hotel

Hmisc::describe(df1$hotel)
## df1$hotel 
##        n  missing distinct 
##   119390        0        2 
##                                     
## Value        City Hotel Resort Hotel
## Frequency         79330        40060
## Proportion        0.664        0.336
table(df1$is_canceled,df1$hotel)
##    
##     City Hotel Resort Hotel
##   0      46228        28938
##   1      33102        11122
ggplot(df1, aes(x= is_canceled,  group=hotel)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="is_canceled") +
    facet_grid(~hotel) +
    scale_y_continuous(labels = scales::percent)

Pair plot 1

df1 %>% select (is_canceled, is_waitlisted, is_repeated_guest, deposit_type) %>% ggpairs(mapping = aes(color=is_canceled))

Pair plot 2

df1 %>% select (is_canceled, has_bchanges, has_srequests, req_parking, has_pcancel) %>% ggpairs(mapping = aes(color=is_canceled))

month, is_canceled and hotel

Hmisc::describe(df1$arrival_date_year)
## df1$arrival_date_year 
##        n  missing distinct 
##   119390        0        3 
##                             
## Value       2015  2016  2017
## Frequency  21996 56707 40687
## Proportion 0.184 0.475 0.341
table(df1$arrival_date_year, df1$arrival_date_month)
##       
##        April August December February January July June March  May November
##   2015     0   3889     2920        0       0 2776    0     0    0     2340
##   2016  5428   5063     3860     3891    2248 4572 5292  4824 5478     4454
##   2017  5661   4925        0     4177    3681 5313 5647  4970 6313        0
##       
##        October September
##   2015    4957      5114
##   2016    6203      5394
##   2017       0         0
df1 %>%
  mutate(arrival_date_month = factor(arrival_date_month,
    levels = month.name
  )) %>%
  count(hotel, arrival_date_month, is_canceled) %>%
  group_by(hotel, is_canceled) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(arrival_date_month, proportion, fill = is_canceled)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(
    x = NULL,
    y = "proportion of is_canceled",
    fill = NULL
  )

is_canceled, is_family and hotel

table(df1$is_family,df1$hotel)  
##    
##     City Hotel Resort Hotel
##   0      73927        36131
##   1       5403         3929
df1 %>%
  count(hotel,is_canceled, is_family) %>%
  group_by(hotel, is_family) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(is_canceled, proportion, fill = is_family)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(
    x = "is_canceled",
    y = "proportion",
    fill = "is_family"
  )

distribution_channel, is_canceled and hotel

ggplot(df1, aes(x = distribution_channel, y = lead_time, fill = is_canceled)) + geom_boxplot(position = position_dodge()) +
  facet_wrap(~hotel, nrow = 5) +
  labs(
    x = "distribution_channel",
    y = "lead_time",
    fill = "is_canceled"
  )

market_segment, is_canceled and hotel

df1 %>%
  count(hotel,market_segment, is_canceled) %>%
  group_by(hotel, is_canceled) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(market_segment, proportion, fill = is_canceled)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(
    x = "market_segment",
    y = "proportion",
    fill = "is_canceled"
  )

customer_type and hotel

checkout = df1 %>% filter(reservation_status == 'Check-Out') 
dim(checkout)
## [1] 75166    45
ggplot(checkout, aes(customer_type, fill=hotel)) + geom_bar(stat='count', position=position_dodge())

customer_type, avg. people and hotel

ggplot(checkout, aes(x= customer_type, y=people, fill= customer_type)) + geom_bar(stat="summary", fun="mean", position=position_dodge()) + facet_wrap(~hotel, nrow = 2)

customer_type, adr and hotel

ggplot(checkout, aes(x = customer_type, y = adr, fill = hotel)) + geom_boxplot(position = position_dodge())

market_segment and customer_type

ggplot(checkout) + geom_bar(aes(x= market_segment, fill= customer_type)) + facet_wrap(~hotel, nrow = 2)

Subset for data modeling

Country

Summary of country levels

country1 = count(df1,country)
summary(country1$n)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     2.00    12.50   670.73    74.75 48590.00

Select countries with >1500 count

subc = df1 %>% 
  group_by(country) %>% 
  filter(n() > 1500) %>%
  as.data.frame()
dim(subc)
## [1] 104627     45
Hmisc::describe(subc$country)
## subc$country 
##        n  missing distinct 
##   104627        0       12 
## 
## lowest : BEL BRA CHE DEU ESP, highest: IRL ITA NLD PRT USA
##                                                                             
## Value        BEL   BRA   CHE   DEU   ESP   FRA   GBR   IRL   ITA   NLD   PRT
## Frequency   2342  2224  1730  7287  8568 10415 12129  3375  3766  2104 48590
## Proportion 0.022 0.021 0.017 0.070 0.082 0.100 0.116 0.032 0.036 0.020 0.464
##                 
## Value        USA
## Frequency   2097
## Proportion 0.020

Outliers in ADR

boxplot(subc$adr)

Drop observations

df2= subc 
df2 = df2[df2$adr !=5400,]
df2=df2[df2$reserved_room_type !="P",]
df2=df2[df2$reserved_room_type !="L",]
df2=df2[df2$distribution_channel !="Undefined",]
df2=df2[df2$market_segment !="Undefined",]
df3a = df2[ df2$stays_in_weekend_nights> 0 | df2$stays_in_week_nights>0 ,]
dim(df3a)
## [1] 103938     45
df3b = df2[ df2$stays_in_weekend_nights== 0 & df2$stays_in_week_nights ==0 ,]
dim(df3b)
## [1] 675  45

Export subset

write.csv(df3a, "df3a.csv", row.names = FALSE)