The third data set is Hotel booking demand from Kaggle. https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand/code
library(tidyverse)
library(lubridate)
hotel <- read_csv("~/data_607_project_2/hotel_bookings.csv")
glimpse(hotel)
## Rows: 119,390
## Columns: 32
## $ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type <chr> "Transient", "Transient", "Transient", …
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
summary(hotel)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Min. :2014-10-17
## 1st Qu.:2016-02-01
## Median :2016-08-07
## Mean :2016-07-30
## 3rd Qu.:2017-02-08
## Max. :2017-09-14
##
31994 rows are duplicated and there are 4 missing value in children column.
# Find out number of missing missing value
# Find out number of duplicated value
hotel |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
t()
## [,1]
## hotel 0
## is_canceled 0
## lead_time 0
## arrival_date_year 0
## arrival_date_month 0
## arrival_date_week_number 0
## arrival_date_day_of_month 0
## stays_in_weekend_nights 0
## stays_in_week_nights 0
## adults 0
## children 4
## babies 0
## meal 0
## country 0
## market_segment 0
## distribution_channel 0
## is_repeated_guest 0
## previous_cancellations 0
## previous_bookings_not_canceled 0
## reserved_room_type 0
## assigned_room_type 0
## booking_changes 0
## deposit_type 0
## agent 0
## company 0
## days_in_waiting_list 0
## customer_type 0
## adr 0
## required_car_parking_spaces 0
## total_of_special_requests 0
## reservation_status 0
## reservation_status_date 0
nrow(hotel[duplicated(hotel),])
## [1] 31994
# drop rows contains NA value
# drop duplicates rows
# remove column:
# transform arrival_date_month into numeric value
# merge these three column in to arrival_date
# drop the column arrival_date_week_number
# Need at least 1 adult to reserve a hotel. Any rows where number of adults equal to 0 need to be eliminated.
# Merge children and babies into new column 'kids'
# drop the column children, babies
# Merge stays_in_weekend_nights, stays_in_week_nights into new column 'total_nights'
hotel2 <- hotel |>
drop_na() |>
distinct() |>
select(-c(agent, company, adr)) |>
mutate(arrival_date_month = match(tolower(arrival_date_month), tolower(month.name))) |>
mutate(arrival_date = as.Date(paste(arrival_date_year, arrival_date_month, arrival_date_day_of_month, sep = "-"))) |>
select(-arrival_date_week_number) |>
filter(adults != 0) |>
mutate(kids = children + babies) |>
select(-c(children, babies)) |>
relocate(kids, .after = adults) |>
mutate(total_nights = stays_in_weekend_nights + stays_in_week_nights) |>
select(-c(stays_in_weekend_nights, stays_in_week_nights)) |>
relocate(total_nights, .after = arrival_date_day_of_month)
hotel2
## # A tibble: 87,007 × 27
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Resort Hotel 0 342 2015 7
## 2 Resort Hotel 0 737 2015 7
## 3 Resort Hotel 0 7 2015 7
## 4 Resort Hotel 0 13 2015 7
## 5 Resort Hotel 0 14 2015 7
## 6 Resort Hotel 0 0 2015 7
## 7 Resort Hotel 0 9 2015 7
## 8 Resort Hotel 1 85 2015 7
## 9 Resort Hotel 1 75 2015 7
## 10 Resort Hotel 1 23 2015 7
## # ℹ 86,997 more rows
## # ℹ 22 more variables: arrival_date_day_of_month <dbl>, total_nights <dbl>,
## # adults <dbl>, kids <dbl>, meal <chr>, country <chr>, market_segment <chr>,
## # distribution_channel <chr>, is_repeated_guest <dbl>,
## # previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## # reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## # deposit_type <chr>, days_in_waiting_list <dbl>, customer_type <chr>, …
# Compare Which Hotel Type is more popular
hotel2 |>
group_by(hotel) |>
summarise(count = n()) |>
ggplot(aes(x = hotel, y = count)) +
geom_bar(stat = "identity") +
geom_text(aes(label = count), vjust = -0.5) +
labs(x = "Hotel Type", y = "Number of Reservations")
# How many percentage of reservations got cancelled?
hotel2 |>
group_by(hotel, is_canceled) |>
summarise(count = n()) |>
mutate(percentage = count / sum(count)) |>
ggplot(aes(x = hotel, y = percentage, fill = ifelse(is_canceled == 1, "canceled", "not canceled"))) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = scales::percent(percentage)), position = position_dodge(width = 0.9), vjust = -0.5) +
labs(title = "Percentage of Canceled Reservations By Hotel", x = "Hotel", y = "Percentage", fill="Type")+
scale_y_continuous(labels = scales::percent)
# Top 10 Countries of Origin for Hotel Visitors
hotel2 |>
group_by(country) |>
summarise(count = n()) |>
top_n(10) |>
ggplot(aes(x = reorder(country, count), y = count, fill = country)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Countries of Origin for Hotel Visitors",
x = "Country",
y = "Number of Guests")