Hotel booking cancellations

Dataset

Research question

What are the key variables for predicting hotel booking cancellations?
- logistic regression, decision tree, random forest

Load libraries

library(tidyverse)
library(Hmisc)
library(psych)
library(skimr)
library(GGally)

Import data

df=read.csv("hotel_bookings.csv",header=TRUE)
dim(df)

## [1] 119390     32

str(df)

## 'data.frame':    119390 obs. of  32 variables:
##  $ hotel                         : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
##  $ is_canceled                   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : int  342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : Factor w/ 12 levels "April","August",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ arrival_date_week_number      : int  27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : int  0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : int  2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : Factor w/ 5 levels "BB","FB","HB",..: 1 1 1 1 1 1 1 2 1 3 ...
##  $ country                       : Factor w/ 178 levels "ABW","AGO","AIA",..: 137 137 60 60 60 60 137 137 137 137 ...
##  $ market_segment                : Factor w/ 8 levels "Aviation","Complementary",..: 4 4 4 3 7 7 4 4 7 6 ...
##  $ distribution_channel          : Factor w/ 5 levels "Corporate","Direct",..: 2 2 2 1 4 4 2 2 4 4 ...
##  $ is_repeated_guest             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : Factor w/ 10 levels "A","B","C","D",..: 3 3 1 1 1 1 3 3 1 4 ...
##  $ assigned_room_type            : Factor w/ 12 levels "A","B","C","D",..: 3 3 3 1 1 1 3 3 1 4 ...
##  $ booking_changes               : int  3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ agent                         : Factor w/ 334 levels "1","10","103",..: 334 334 334 157 103 103 334 156 103 40 ...
##  $ company                       : Factor w/ 353 levels "10","100","101",..: 353 353 353 353 353 353 353 353 353 353 ...
##  $ days_in_waiting_list          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ adr                           : num  0 0 75 75 98 ...
##  $ required_car_parking_spaces   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : Factor w/ 3 levels "Canceled","Check-Out",..: 2 2 2 2 2 2 2 2 1 1 ...
##  $ reservation_status_date       : Factor w/ 926 levels "2014-10-17","2014-11-18",..: 122 122 123 123 124 124 124 124 73 62 ...

Data preparation

Missing value analysis

mva = apply(ifelse(df == 'NULL', 1, 0), 2, sum)
df = df %>% mutate_at(vars(children), ~replace(.,is.na(.),0))
summary(df$children)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1039  0.0000 10.0000

Change variable type

df = df %>% mutate_at (vars(is_canceled,arrival_date_year,arrival_date_week_number,is_repeated_guest), list(factor))
df$reservation_status_date = as.Date(df$reservation_status_date)

Create new variables

df1=df 
df1$nights= df1$stays_in_weekend_nights + df1$stays_in_week_nights
df1$people= df1$adults + df1$children + df1$babies
df1$children_babies= df1$children + df1$babies
df1$is_family= ifelse(df1$children_babies>=1,"1","0")
df1$is_waitlisted= ifelse(df1$days_in_waiting_list>=1,"1","0")
df1$reserved_room_type = as.character(df1$reserved_room_type)
df1$assigned_room_type = as.character(df1$assigned_room_type)
df1$room_type_change= ifelse(df1$reserved_room_type == df1$assigned_room_type,"0","1")
df1$company = as.character(df1$company)
df1$agent = as.character(df1$agent)
df1$from_company = ifelse(grepl("NULL", df1$company),0,1)
df1$from_agent = ifelse(grepl("NULL", df1$agent),0,1)
df1$has_pcancel = ifelse(df1$previous_cancellations>=1,"1","0")
df1$has_pbnc = ifelse(df1$previous_bookings_not_canceled>=1,"1","0")
df1$has_bchanges = ifelse(df1$booking_changes>=1,"1","0")
df1$has_srequests = ifelse(df1$total_of_special_requests>=1,"1","0")
df1$req_parking = ifelse(df1$required_car_parking_spaces>=1,"1","0")

Summary

df1 = df1 %>% mutate_at (vars(reserved_room_type,assigned_room_type, room_type_change, is_family, is_waitlisted, agent,company, has_pcancel, has_pbnc, has_bchanges, has_srequests, from_company, from_agent, req_parking), list(factor))
skim(df1)

Data summary
Name	df1
Number of rows	119390
Number of columns	45
_______________________
Column type frequency:
Date	1
factor	27
numeric	17
________________________
Group variables

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
reservation_status_date	0	1	2014-10-17	2017-09-14	2016-08-07	926

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
hotel	1	FALSE	2	Cit: 79330, Res: 40060
is_canceled	1	FALSE	2	0: 75166, 1: 44224
arrival_date_year	1	FALSE	3	201: 56707, 201: 40687, 201: 21996
arrival_date_month	1	FALSE	12	Aug: 13877, Jul: 12661, May: 11791, Oct: 11160
arrival_date_week_number	1	FALSE	53	33: 3580, 30: 3087, 32: 3045, 34: 3040
meal	1	FALSE	5	BB: 92310, HB: 14463, SC: 10650, Und: 1169
country	1	FALSE	178	PRT: 48590, GBR: 12129, FRA: 10415, ESP: 8568
market_segment	1	FALSE	8	Onl: 56477, Off: 24219, Gro: 19811, Dir: 12606
distribution_channel	1	FALSE	5	TA/: 97870, Dir: 14645, Cor: 6677, GDS: 193
is_repeated_guest	1	FALSE	2	0: 115580, 1: 3810
reserved_room_type	1	FALSE	10	A: 85994, D: 19201, E: 6535, F: 2897
assigned_room_type	1	FALSE	12	A: 74053, D: 25322, E: 7806, F: 3751
deposit_type	1	FALSE	3	No : 104641, Non: 14587, Ref: 162
agent	1	FALSE	334	9: 31961, NUL: 16340, 240: 13922, 1: 7191
company	1	FALSE	353	NUL: 112593, 40: 927, 223: 784, 67: 267
customer_type	1	FALSE	4	Tra: 89613, Tra: 25124, Con: 4076, Gro: 577
reservation_status	1	FALSE	3	Che: 75166, Can: 43017, No-: 1207
is_family	1	FALSE	2	0: 110058, 1: 9332
is_waitlisted	1	FALSE	2	0: 115692, 1: 3698
room_type_change	1	FALSE	2	0: 104473, 1: 14917
from_company	1	FALSE	2	0: 112593, 1: 6797
from_agent	1	FALSE	2	1: 103050, 0: 16340
has_pcancel	1	FALSE	2	0: 112906, 1: 6484
has_pbnc	1	FALSE	2	0: 115770, 1: 3620
has_bchanges	1	FALSE	2	0: 101314, 1: 18076
has_srequests	1	FALSE	2	0: 70318, 1: 49072
req_parking	1	FALSE	2	0: 111974, 1: 7416

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
lead_time	1	104.01	106.86	0.00	18.00	69.00	160	737	▇▂▁▁▁
arrival_date_day_of_month	1	15.80	8.78	1.00	8.00	16.00	23	31	▇▇▇▇▆
stays_in_weekend_nights	1	0.93	1.00	0.00	0.00	1.00	2	19	▇▁▁▁▁
stays_in_week_nights	1	2.50	1.91	0.00	1.00	2.00	3	50	▇▁▁▁▁
adults	1	1.86	0.58	0.00	2.00	2.00	2	55	▇▁▁▁▁
children	1	0.10	0.40	0.00	0.00	0.00	0	10	▇▁▁▁▁
babies	1	0.01	0.10	0.00	0.00	0.00	0	10	▇▁▁▁▁
previous_cancellations	1	0.09	0.84	0.00	0.00	0.00	0	26	▇▁▁▁▁
previous_bookings_not_canceled	1	0.14	1.50	0.00	0.00	0.00	0	72	▇▁▁▁▁
booking_changes	1	0.22	0.65	0.00	0.00	0.00	0	21	▇▁▁▁▁
days_in_waiting_list	1	2.32	17.59	0.00	0.00	0.00	0	391	▇▁▁▁▁
adr	1	101.83	50.54	-6.38	69.29	94.58	126	5400	▇▁▁▁▁
required_car_parking_spaces	1	0.06	0.25	0.00	0.00	0.00	0	8	▇▁▁▁▁
total_of_special_requests	1	0.57	0.79	0.00	0.00	0.00	1	5	▇▁▁▁▁
nights	1	3.43	2.56	0.00	2.00	3.00	4	69	▇▁▁▁▁
people	1	1.97	0.72	0.00	2.00	2.00	2	55	▇▁▁▁▁
children_babies	1	0.11	0.41	0.00	0.00	0.00	0	10	▇▁▁▁▁

Vizualisations

is canceled and hotel

Hmisc::describe(df1$hotel)

## df1$hotel 
##        n  missing distinct 
##   119390        0        2 
##                                     
## Value        City Hotel Resort Hotel
## Frequency         79330        40060
## Proportion        0.664        0.336

table(df1$is_canceled,df1$hotel)

##    
##     City Hotel Resort Hotel
##   0      46228        28938
##   1      33102        11122

ggplot(df1, aes(x= is_canceled,  group=hotel)) + 
    geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
    geom_text(aes( label = scales::percent(..prop..),
                   y= ..prop.. ), stat= "count", vjust = -.5) +
    labs(y = "Percent", fill="is_canceled") +
    facet_grid(~hotel) +
    scale_y_continuous(labels = scales::percent)

Pair plot 1

df1 %>% select (is_canceled, is_waitlisted, is_repeated_guest, deposit_type) %>% ggpairs(mapping = aes(color=is_canceled))

Pair plot 2

df1 %>% select (is_canceled, has_bchanges, has_srequests, req_parking, has_pcancel) %>% ggpairs(mapping = aes(color=is_canceled))

month, is_canceled and hotel

Hmisc::describe(df1$arrival_date_year)

## df1$arrival_date_year 
##        n  missing distinct 
##   119390        0        3 
##                             
## Value       2015  2016  2017
## Frequency  21996 56707 40687
## Proportion 0.184 0.475 0.341

table(df1$arrival_date_year, df1$arrival_date_month)

##       
##        April August December February January July June March  May November
##   2015     0   3889     2920        0       0 2776    0     0    0     2340
##   2016  5428   5063     3860     3891    2248 4572 5292  4824 5478     4454
##   2017  5661   4925        0     4177    3681 5313 5647  4970 6313        0
##       
##        October September
##   2015    4957      5114
##   2016    6203      5394
##   2017       0         0

df1 %>%
  mutate(arrival_date_month = factor(arrival_date_month,
    levels = month.name
  )) %>%
  count(hotel, arrival_date_month, is_canceled) %>%
  group_by(hotel, is_canceled) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(arrival_date_month, proportion, fill = is_canceled)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(
    x = NULL,
    y = "proportion of is_canceled",
    fill = NULL
  )

is_canceled, is_family and hotel

table(df1$is_family,df1$hotel)

##    
##     City Hotel Resort Hotel
##   0      73927        36131
##   1       5403         3929

df1 %>%
  count(hotel,is_canceled, is_family) %>%
  group_by(hotel, is_family) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(is_canceled, proportion, fill = is_family)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(
    x = "is_canceled",
    y = "proportion",
    fill = "is_family"
  )

distribution_channel, is_canceled and hotel

ggplot(df1, aes(x = distribution_channel, y = lead_time, fill = is_canceled)) + geom_boxplot(position = position_dodge()) +
  facet_wrap(~hotel, nrow = 5) +
  labs(
    x = "distribution_channel",
    y = "lead_time",
    fill = "is_canceled"
  )

market_segment, is_canceled and hotel

df1 %>%
  count(hotel,market_segment, is_canceled) %>%
  group_by(hotel, is_canceled) %>%
  mutate(proportion = n / sum(n)) %>%
  ggplot(aes(market_segment, proportion, fill = is_canceled)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  facet_wrap(~hotel, nrow = 2) +
  labs(
    x = "market_segment",
    y = "proportion",
    fill = "is_canceled"
  )

customer_type and hotel

checkout = df1 %>% filter(reservation_status == 'Check-Out') 
dim(checkout)

## [1] 75166    45

ggplot(checkout, aes(customer_type, fill=hotel)) + geom_bar(stat='count', position=position_dodge())

customer_type, avg. people and hotel

ggplot(checkout, aes(x= customer_type, y=people, fill= customer_type)) + geom_bar(stat="summary", fun="mean", position=position_dodge()) + facet_wrap(~hotel, nrow = 2)

customer_type, adr and hotel

ggplot(checkout, aes(x = customer_type, y = adr, fill = hotel)) + geom_boxplot(position = position_dodge())

market_segment and customer_type

ggplot(checkout) + geom_bar(aes(x= market_segment, fill= customer_type)) + facet_wrap(~hotel, nrow = 2)

Subset for data modeling

Country

Summary of country levels

country1 = count(df1,country)
summary(country1$n)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     2.00    12.50   670.73    74.75 48590.00

Select countries with >1500 count

subc = df1 %>% 
  group_by(country) %>% 
  filter(n() > 1500) %>%
  as.data.frame()
dim(subc)

## [1] 104627     45

Hmisc::describe(subc$country)

## subc$country 
##        n  missing distinct 
##   104627        0       12 
## 
## lowest : BEL BRA CHE DEU ESP, highest: IRL ITA NLD PRT USA
##                                                                             
## Value        BEL   BRA   CHE   DEU   ESP   FRA   GBR   IRL   ITA   NLD   PRT
## Frequency   2342  2224  1730  7287  8568 10415 12129  3375  3766  2104 48590
## Proportion 0.022 0.021 0.017 0.070 0.082 0.100 0.116 0.032 0.036 0.020 0.464
##                 
## Value        USA
## Frequency   2097
## Proportion 0.020

Outliers in ADR

boxplot(subc$adr)

Drop observations

df2= subc 
df2 = df2[df2$adr !=5400,]
df2=df2[df2$reserved_room_type !="P",]
df2=df2[df2$reserved_room_type !="L",]
df2=df2[df2$distribution_channel !="Undefined",]
df2=df2[df2$market_segment !="Undefined",]
df3a = df2[ df2$stays_in_weekend_nights> 0 | df2$stays_in_week_nights>0 ,]
dim(df3a)

## [1] 103938     45

df3b = df2[ df2$stays_in_weekend_nights== 0 & df2$stays_in_week_nights ==0 ,]
dim(df3b)

## [1] 675  45

Export subset

write.csv(df3a, "df3a.csv", row.names = FALSE)

Hotel booking cancellations - Part 1