library(tidyverse)
library(Hmisc)
library(psych)
library(skimr)
library(GGally)
df=read.csv("hotel_bookings.csv",header=TRUE)
dim(df)
## [1] 119390 32
str(df)
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : Factor w/ 12 levels "April","August",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : Factor w/ 5 levels "BB","FB","HB",..: 1 1 1 1 1 1 1 2 1 3 ...
## $ country : Factor w/ 178 levels "ABW","AGO","AIA",..: 137 137 60 60 60 60 137 137 137 137 ...
## $ market_segment : Factor w/ 8 levels "Aviation","Complementary",..: 4 4 4 3 7 7 4 4 7 6 ...
## $ distribution_channel : Factor w/ 5 levels "Corporate","Direct",..: 2 2 2 1 4 4 2 2 4 4 ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : Factor w/ 10 levels "A","B","C","D",..: 3 3 1 1 1 1 3 3 1 4 ...
## $ assigned_room_type : Factor w/ 12 levels "A","B","C","D",..: 3 3 3 1 1 1 3 3 1 4 ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ agent : Factor w/ 334 levels "1","10","103",..: 334 334 334 157 103 103 334 156 103 40 ...
## $ company : Factor w/ 353 levels "10","100","101",..: 353 353 353 353 353 353 353 353 353 353 ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : Factor w/ 3 levels "Canceled","Check-Out",..: 2 2 2 2 2 2 2 2 1 1 ...
## $ reservation_status_date : Factor w/ 926 levels "2014-10-17","2014-11-18",..: 122 122 123 123 124 124 124 124 73 62 ...
Missing value analysis
mva = apply(ifelse(df == 'NULL', 1, 0), 2, sum)
df = df %>% mutate_at(vars(children), ~replace(.,is.na(.),0))
summary(df$children)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1039 0.0000 10.0000
Change variable type
df = df %>% mutate_at (vars(is_canceled,arrival_date_year,arrival_date_week_number,is_repeated_guest), list(factor))
df$reservation_status_date = as.Date(df$reservation_status_date)
Create new variables
df1=df
df1$nights= df1$stays_in_weekend_nights + df1$stays_in_week_nights
df1$people= df1$adults + df1$children + df1$babies
df1$children_babies= df1$children + df1$babies
df1$is_family= ifelse(df1$children_babies>=1,"1","0")
df1$is_waitlisted= ifelse(df1$days_in_waiting_list>=1,"1","0")
df1$reserved_room_type = as.character(df1$reserved_room_type)
df1$assigned_room_type = as.character(df1$assigned_room_type)
df1$room_type_change= ifelse(df1$reserved_room_type == df1$assigned_room_type,"0","1")
df1$company = as.character(df1$company)
df1$agent = as.character(df1$agent)
df1$from_company = ifelse(grepl("NULL", df1$company),0,1)
df1$from_agent = ifelse(grepl("NULL", df1$agent),0,1)
df1$has_pcancel = ifelse(df1$previous_cancellations>=1,"1","0")
df1$has_pbnc = ifelse(df1$previous_bookings_not_canceled>=1,"1","0")
df1$has_bchanges = ifelse(df1$booking_changes>=1,"1","0")
df1$has_srequests = ifelse(df1$total_of_special_requests>=1,"1","0")
df1$req_parking = ifelse(df1$required_car_parking_spaces>=1,"1","0")
Summary
df1 = df1 %>% mutate_at (vars(reserved_room_type,assigned_room_type, room_type_change, is_family, is_waitlisted, agent,company, has_pcancel, has_pbnc, has_bchanges, has_srequests, from_company, from_agent, req_parking), list(factor))
skim(df1)
| Name | df1 |
| Number of rows | 119390 |
| Number of columns | 45 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| factor | 27 |
| numeric | 17 |
| ________________________ | |
| Group variables |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| reservation_status_date | 0 | 1 | 2014-10-17 | 2017-09-14 | 2016-08-07 | 926 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| hotel | 0 | 1 | FALSE | 2 | Cit: 79330, Res: 40060 |
| is_canceled | 0 | 1 | FALSE | 2 | 0: 75166, 1: 44224 |
| arrival_date_year | 0 | 1 | FALSE | 3 | 201: 56707, 201: 40687, 201: 21996 |
| arrival_date_month | 0 | 1 | FALSE | 12 | Aug: 13877, Jul: 12661, May: 11791, Oct: 11160 |
| arrival_date_week_number | 0 | 1 | FALSE | 53 | 33: 3580, 30: 3087, 32: 3045, 34: 3040 |
| meal | 0 | 1 | FALSE | 5 | BB: 92310, HB: 14463, SC: 10650, Und: 1169 |
| country | 0 | 1 | FALSE | 178 | PRT: 48590, GBR: 12129, FRA: 10415, ESP: 8568 |
| market_segment | 0 | 1 | FALSE | 8 | Onl: 56477, Off: 24219, Gro: 19811, Dir: 12606 |
| distribution_channel | 0 | 1 | FALSE | 5 | TA/: 97870, Dir: 14645, Cor: 6677, GDS: 193 |
| is_repeated_guest | 0 | 1 | FALSE | 2 | 0: 115580, 1: 3810 |
| reserved_room_type | 0 | 1 | FALSE | 10 | A: 85994, D: 19201, E: 6535, F: 2897 |
| assigned_room_type | 0 | 1 | FALSE | 12 | A: 74053, D: 25322, E: 7806, F: 3751 |
| deposit_type | 0 | 1 | FALSE | 3 | No : 104641, Non: 14587, Ref: 162 |
| agent | 0 | 1 | FALSE | 334 | 9: 31961, NUL: 16340, 240: 13922, 1: 7191 |
| company | 0 | 1 | FALSE | 353 | NUL: 112593, 40: 927, 223: 784, 67: 267 |
| customer_type | 0 | 1 | FALSE | 4 | Tra: 89613, Tra: 25124, Con: 4076, Gro: 577 |
| reservation_status | 0 | 1 | FALSE | 3 | Che: 75166, Can: 43017, No-: 1207 |
| is_family | 0 | 1 | FALSE | 2 | 0: 110058, 1: 9332 |
| is_waitlisted | 0 | 1 | FALSE | 2 | 0: 115692, 1: 3698 |
| room_type_change | 0 | 1 | FALSE | 2 | 0: 104473, 1: 14917 |
| from_company | 0 | 1 | FALSE | 2 | 0: 112593, 1: 6797 |
| from_agent | 0 | 1 | FALSE | 2 | 1: 103050, 0: 16340 |
| has_pcancel | 0 | 1 | FALSE | 2 | 0: 112906, 1: 6484 |
| has_pbnc | 0 | 1 | FALSE | 2 | 0: 115770, 1: 3620 |
| has_bchanges | 0 | 1 | FALSE | 2 | 0: 101314, 1: 18076 |
| has_srequests | 0 | 1 | FALSE | 2 | 0: 70318, 1: 49072 |
| req_parking | 0 | 1 | FALSE | 2 | 0: 111974, 1: 7416 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| lead_time | 0 | 1 | 104.01 | 106.86 | 0.00 | 18.00 | 69.00 | 160 | 737 | ▇▂▁▁▁ |
| arrival_date_day_of_month | 0 | 1 | 15.80 | 8.78 | 1.00 | 8.00 | 16.00 | 23 | 31 | ▇▇▇▇▆ |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 1.00 | 0.00 | 0.00 | 1.00 | 2 | 19 | ▇▁▁▁▁ |
| stays_in_week_nights | 0 | 1 | 2.50 | 1.91 | 0.00 | 1.00 | 2.00 | 3 | 50 | ▇▁▁▁▁ |
| adults | 0 | 1 | 1.86 | 0.58 | 0.00 | 2.00 | 2.00 | 2 | 55 | ▇▁▁▁▁ |
| children | 0 | 1 | 0.10 | 0.40 | 0.00 | 0.00 | 0.00 | 0 | 10 | ▇▁▁▁▁ |
| babies | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0 | 10 | ▇▁▁▁▁ |
| previous_cancellations | 0 | 1 | 0.09 | 0.84 | 0.00 | 0.00 | 0.00 | 0 | 26 | ▇▁▁▁▁ |
| previous_bookings_not_canceled | 0 | 1 | 0.14 | 1.50 | 0.00 | 0.00 | 0.00 | 0 | 72 | ▇▁▁▁▁ |
| booking_changes | 0 | 1 | 0.22 | 0.65 | 0.00 | 0.00 | 0.00 | 0 | 21 | ▇▁▁▁▁ |
| days_in_waiting_list | 0 | 1 | 2.32 | 17.59 | 0.00 | 0.00 | 0.00 | 0 | 391 | ▇▁▁▁▁ |
| adr | 0 | 1 | 101.83 | 50.54 | -6.38 | 69.29 | 94.58 | 126 | 5400 | ▇▁▁▁▁ |
| required_car_parking_spaces | 0 | 1 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0 | 8 | ▇▁▁▁▁ |
| total_of_special_requests | 0 | 1 | 0.57 | 0.79 | 0.00 | 0.00 | 0.00 | 1 | 5 | ▇▁▁▁▁ |
| nights | 0 | 1 | 3.43 | 2.56 | 0.00 | 2.00 | 3.00 | 4 | 69 | ▇▁▁▁▁ |
| people | 0 | 1 | 1.97 | 0.72 | 0.00 | 2.00 | 2.00 | 2 | 55 | ▇▁▁▁▁ |
| children_babies | 0 | 1 | 0.11 | 0.41 | 0.00 | 0.00 | 0.00 | 0 | 10 | ▇▁▁▁▁ |
is canceled and hotel
Hmisc::describe(df1$hotel)
## df1$hotel
## n missing distinct
## 119390 0 2
##
## Value City Hotel Resort Hotel
## Frequency 79330 40060
## Proportion 0.664 0.336
table(df1$is_canceled,df1$hotel)
##
## City Hotel Resort Hotel
## 0 46228 28938
## 1 33102 11122
ggplot(df1, aes(x= is_canceled, group=hotel)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="is_canceled") +
facet_grid(~hotel) +
scale_y_continuous(labels = scales::percent)
Pair plot 1
df1 %>% select (is_canceled, is_waitlisted, is_repeated_guest, deposit_type) %>% ggpairs(mapping = aes(color=is_canceled))
Pair plot 2
df1 %>% select (is_canceled, has_bchanges, has_srequests, req_parking, has_pcancel) %>% ggpairs(mapping = aes(color=is_canceled))
month, is_canceled and hotel
Hmisc::describe(df1$arrival_date_year)
## df1$arrival_date_year
## n missing distinct
## 119390 0 3
##
## Value 2015 2016 2017
## Frequency 21996 56707 40687
## Proportion 0.184 0.475 0.341
table(df1$arrival_date_year, df1$arrival_date_month)
##
## April August December February January July June March May November
## 2015 0 3889 2920 0 0 2776 0 0 0 2340
## 2016 5428 5063 3860 3891 2248 4572 5292 4824 5478 4454
## 2017 5661 4925 0 4177 3681 5313 5647 4970 6313 0
##
## October September
## 2015 4957 5114
## 2016 6203 5394
## 2017 0 0
df1 %>%
mutate(arrival_date_month = factor(arrival_date_month,
levels = month.name
)) %>%
count(hotel, arrival_date_month, is_canceled) %>%
group_by(hotel, is_canceled) %>%
mutate(proportion = n / sum(n)) %>%
ggplot(aes(arrival_date_month, proportion, fill = is_canceled)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
facet_wrap(~hotel, nrow = 2) +
labs(
x = NULL,
y = "proportion of is_canceled",
fill = NULL
)
is_canceled, is_family and hotel
table(df1$is_family,df1$hotel)
##
## City Hotel Resort Hotel
## 0 73927 36131
## 1 5403 3929
df1 %>%
count(hotel,is_canceled, is_family) %>%
group_by(hotel, is_family) %>%
mutate(proportion = n / sum(n)) %>%
ggplot(aes(is_canceled, proportion, fill = is_family)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
facet_wrap(~hotel, nrow = 2) +
labs(
x = "is_canceled",
y = "proportion",
fill = "is_family"
)
distribution_channel, is_canceled and hotel
ggplot(df1, aes(x = distribution_channel, y = lead_time, fill = is_canceled)) + geom_boxplot(position = position_dodge()) +
facet_wrap(~hotel, nrow = 5) +
labs(
x = "distribution_channel",
y = "lead_time",
fill = "is_canceled"
)
market_segment, is_canceled and hotel
df1 %>%
count(hotel,market_segment, is_canceled) %>%
group_by(hotel, is_canceled) %>%
mutate(proportion = n / sum(n)) %>%
ggplot(aes(market_segment, proportion, fill = is_canceled)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
facet_wrap(~hotel, nrow = 2) +
labs(
x = "market_segment",
y = "proportion",
fill = "is_canceled"
)
customer_type and hotel
checkout = df1 %>% filter(reservation_status == 'Check-Out')
dim(checkout)
## [1] 75166 45
ggplot(checkout, aes(customer_type, fill=hotel)) + geom_bar(stat='count', position=position_dodge())
customer_type, avg. people and hotel
ggplot(checkout, aes(x= customer_type, y=people, fill= customer_type)) + geom_bar(stat="summary", fun="mean", position=position_dodge()) + facet_wrap(~hotel, nrow = 2)
customer_type, adr and hotel
ggplot(checkout, aes(x = customer_type, y = adr, fill = hotel)) + geom_boxplot(position = position_dodge())
market_segment and customer_type
ggplot(checkout) + geom_bar(aes(x= market_segment, fill= customer_type)) + facet_wrap(~hotel, nrow = 2)
Summary of country levels
country1 = count(df1,country)
summary(country1$n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 12.50 670.73 74.75 48590.00
Select countries with >1500 count
subc = df1 %>%
group_by(country) %>%
filter(n() > 1500) %>%
as.data.frame()
dim(subc)
## [1] 104627 45
Hmisc::describe(subc$country)
## subc$country
## n missing distinct
## 104627 0 12
##
## lowest : BEL BRA CHE DEU ESP, highest: IRL ITA NLD PRT USA
##
## Value BEL BRA CHE DEU ESP FRA GBR IRL ITA NLD PRT
## Frequency 2342 2224 1730 7287 8568 10415 12129 3375 3766 2104 48590
## Proportion 0.022 0.021 0.017 0.070 0.082 0.100 0.116 0.032 0.036 0.020 0.464
##
## Value USA
## Frequency 2097
## Proportion 0.020
boxplot(subc$adr)
df2= subc
df2 = df2[df2$adr !=5400,]
df2=df2[df2$reserved_room_type !="P",]
df2=df2[df2$reserved_room_type !="L",]
df2=df2[df2$distribution_channel !="Undefined",]
df2=df2[df2$market_segment !="Undefined",]
df3a = df2[ df2$stays_in_weekend_nights> 0 | df2$stays_in_week_nights>0 ,]
dim(df3a)
## [1] 103938 45
df3b = df2[ df2$stays_in_weekend_nights== 0 & df2$stays_in_week_nights ==0 ,]
dim(df3b)
## [1] 675 45
write.csv(df3a, "df3a.csv", row.names = FALSE)