library(conflicted)
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
library(ggplot2)
library(zoo)
hotel_bookings <- read.csv("challenge_datasets/hotel_bookings.csv")
as_tibble(hotel_bookings)
## # A tibble: 119,390 × 32
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <int> <int> <int> <chr>
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## 7 Resort Hotel 0 0 2015 July
## 8 Resort Hotel 0 9 2015 July
## 9 Resort Hotel 1 85 2015 July
## 10 Resort Hotel 1 75 2015 July
## # ℹ 119,380 more rows
## # ℹ 27 more variables: arrival_date_week_number <int>,
## # arrival_date_day_of_month <int>, stays_in_weekend_nights <int>,
## # stays_in_week_nights <int>, adults <int>, children <int>, babies <int>,
## # meal <chr>, country <chr>, market_segment <chr>,
## # distribution_channel <chr>, is_repeated_guest <int>,
## # previous_cancellations <int>, previous_bookings_not_canceled <int>, …
In this section, we will do an elementary analysis to understand the data.
# Visualize the first few rows of the data
head(hotel_bookings)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 27 1 0
## 2 27 1 0
## 3 27 1 0
## 4 27 1 0
## 5 27 1 0
## 6 27 1 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 0 2 0 0 BB PRT Direct
## 2 0 2 0 0 BB PRT Direct
## 3 1 1 0 0 BB GBR Direct
## 4 1 1 0 0 BB GBR Corporate
## 5 2 2 0 0 BB GBR Online TA
## 6 2 2 0 0 BB GBR Online TA
## distribution_channel is_repeated_guest previous_cancellations
## 1 Direct 0 0
## 2 Direct 0 0
## 3 Direct 0 0
## 4 Corporate 0 0
## 5 TA/TO 0 0
## 6 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 C C
## 2 0 C C
## 3 0 A C
## 4 0 A A
## 5 0 A A
## 6 0 A A
## booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1 3 No Deposit NULL NULL 0 Transient
## 2 4 No Deposit NULL NULL 0 Transient
## 3 0 No Deposit NULL NULL 0 Transient
## 4 0 No Deposit 304 NULL 0 Transient
## 5 0 No Deposit 240 NULL 0 Transient
## 6 0 No Deposit 240 NULL 0 Transient
## adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 0 0 0 Check-Out
## 2 0 0 0 Check-Out
## 3 75 0 0 Check-Out
## 4 75 0 0 Check-Out
## 5 98 0 1 Check-Out
## 6 98 0 1 Check-Out
## reservation_status_date
## 1 2015-07-01
## 2 2015-07-01
## 3 2015-07-02
## 4 2015-07-02
## 5 2015-07-03
## 6 2015-07-03
The summary table provides a lot of insights on the hotel bookings data.
is_cancelled: The data reveals that about 37% of the bookings are cancelled.
lead_time: The number of days between the booking date and arrival date. The average lead time is around 104 days.
arrival_date_year: Bookings span from 2015 to 2017.
stays_in_weekend_nights and stays_in_week_nights: Guests stay, on average, 1 day over the weekend and 2.5 days during the week.
adults: Most bookings are for two adults, likely indicating couples or pairs of travelers.
children and babies: There are very few bookings with children or babies.
is_repeated_guest: Only about 3.19% of the bookings were made by repeat guests.
previous_cancellations: On average, there’s a low number of previous cancellations per guest, with a mean around 0.087.
booking_changes: Changes to bookings are relatively infrequent, with an average of 0.221 changes per booking.
adr: The average daily rate for a room is about 94.58, indicating the average revenue per booked room.
# Get a summary of the dataset
summary(hotel_bookings)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Length:119390
## Class :character
## Mode :character
##
##
##
##
# View the structure of the data
str(hotel_bookings)
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
We get to understand the hotel types using the following command.
# Get unique hotel types, countries
print(unique(hotel_bookings$hotel))
## [1] "Resort Hotel" "City Hotel"
print(unique(hotel_bookings$country))
## [1] "PRT" "GBR" "USA" "ESP" "IRL" "FRA" "NULL" "ROU" "NOR" "OMN"
## [11] "ARG" "POL" "DEU" "BEL" "CHE" "CN" "GRC" "ITA" "NLD" "DNK"
## [21] "RUS" "SWE" "AUS" "EST" "CZE" "BRA" "FIN" "MOZ" "BWA" "LUX"
## [31] "SVN" "ALB" "IND" "CHN" "MEX" "MAR" "UKR" "SMR" "LVA" "PRI"
## [41] "SRB" "CHL" "AUT" "BLR" "LTU" "TUR" "ZAF" "AGO" "ISR" "CYM"
## [51] "ZMB" "CPV" "ZWE" "DZA" "KOR" "CRI" "HUN" "ARE" "TUN" "JAM"
## [61] "HRV" "HKG" "IRN" "GEO" "AND" "GIB" "URY" "JEY" "CAF" "CYP"
## [71] "COL" "GGY" "KWT" "NGA" "MDV" "VEN" "SVK" "FJI" "KAZ" "PAK"
## [81] "IDN" "LBN" "PHL" "SEN" "SYC" "AZE" "BHR" "NZL" "THA" "DOM"
## [91] "MKD" "MYS" "ARM" "JPN" "LKA" "CUB" "CMR" "BIH" "MUS" "COM"
## [101] "SUR" "UGA" "BGR" "CIV" "JOR" "SYR" "SGP" "BDI" "SAU" "VNM"
## [111] "PLW" "QAT" "EGY" "PER" "MLT" "MWI" "ECU" "MDG" "ISL" "UZB"
## [121] "NPL" "BHS" "MAC" "TGO" "TWN" "DJI" "STP" "KNA" "ETH" "IRQ"
## [131] "HND" "RWA" "KHM" "MCO" "BGD" "IMN" "TJK" "NIC" "BEN" "VGB"
## [141] "TZA" "GAB" "GHA" "TMP" "GLP" "KEN" "LIE" "GNB" "MNE" "UMI"
## [151] "MYT" "FRO" "MMR" "PAN" "BFA" "LBY" "MLI" "NAM" "BOL" "PRY"
## [161] "BRB" "ABW" "AIA" "SLV" "DMA" "PYF" "GUY" "LCA" "ATA" "GTM"
## [171] "ASM" "MRT" "NCL" "KIR" "SDN" "ATF" "SLE" "LAO"
The hotel_bookings dataset seems to be in a tidy format.
This is because, each variable forms a column, and all the variables are
distinct, such as hotel, is_cancelled,
lead_time, etc. Each row in the dataset represents a single
observation, observation in our case is a booking record.
The reservation_status_date column is an object type but should be in a Date format for any time series analysis or date-based filtering.
# Convert reservation_status_date to Date format
hotel_bookings <- hotel_bookings %>%
mutate(reservation_status_date = as.Date(reservation_status_date, format = "%Y-%m-%d"))
Columns such as hotel, arrival_date_month, meal, country, market_segment, distribution_channel, reserved_room_type, assigned_room_type, deposit_type, customer_type, and reservation_status are of object type. These might be better represented as categorical variables (factors) if used for grouping or statistical analysis.
month_levels <- c('January', 'February', 'March', 'April', 'May', 'June', 'July',
'August', 'September', 'October', 'November', 'December')
hotel_bookings <- hotel_bookings %>%
mutate(
arrival_date_month = factor(arrival_date_month, levels = month_levels),
meal = factor(meal),
country = factor(country),
market_segment = factor(market_segment),
distribution_channel = factor(distribution_channel),
reserved_room_type = factor(reserved_room_type),
assigned_room_type = factor(assigned_room_type),
deposit_type = factor(deposit_type),
customer_type = factor(customer_type),
reservation_status = factor(reservation_status)
)
Columns like agent, and company are floats, indicating missing values (NaNs). We want to handle these missing values by filling them with a default value, like 0.
# Handle 'agent' and 'company' columns where "NULL" represents missing values
hotel_bookings$agent[hotel_bookings$agent == "NULL"] <- NA
hotel_bookings$company[hotel_bookings$company == "NULL"] <- NA
# Convert 'agent' and 'company' to numeric, assuming NA where they were "NULL"
hotel_bookings <- hotel_bookings %>%
mutate(
agent = as.numeric(agent),
company = as.numeric(company)
)
# Replace NA values with 0 for 'agent' and 'company'
hotel_bookings <- hotel_bookings %>%
mutate(
agent = ifelse(is.na(agent), 0, agent),
company = ifelse(is.na(company), 0, company)
)
`stays_in_weekend_nights and stays_in_week_nights can be combined into a single total_stay_nights column.
# Create a total_stay_nights variable
hotel_bookings <- hotel_bookings %>%
mutate(total_stay_nights = stays_in_weekend_nights + stays_in_week_nights)
The dataset now includes these transformations, making it more suitable for further analysis. As a sanity check we will check the structure of the dataset once again.
# Sanity check: Check the structure of the dataset after mutation
str(hotel_bookings)
## 'data.frame': 119390 obs. of 33 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : Factor w/ 12 levels "January","February",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : Factor w/ 5 levels "BB","FB","HB",..: 1 1 1 1 1 1 1 2 1 3 ...
## $ country : Factor w/ 178 levels "ABW","AGO","AIA",..: 137 137 60 60 60 60 137 137 137 137 ...
## $ market_segment : Factor w/ 8 levels "Aviation","Complementary",..: 4 4 4 3 7 7 4 4 7 6 ...
## $ distribution_channel : Factor w/ 5 levels "Corporate","Direct",..: 2 2 2 1 4 4 2 2 4 4 ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : Factor w/ 10 levels "A","B","C","D",..: 3 3 1 1 1 1 3 3 1 4 ...
## $ assigned_room_type : Factor w/ 12 levels "A","B","C","D",..: 3 3 3 1 1 1 3 3 1 4 ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ agent : num 0 0 0 304 240 240 0 303 240 15 ...
## $ company : num 0 0 0 0 0 0 0 0 0 0 ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : Factor w/ 3 levels "Canceled","Check-Out",..: 2 2 2 2 2 2 2 2 1 1 ...
## $ reservation_status_date : Date, format: "2015-07-01" "2015-07-01" ...
## $ total_stay_nights : int 0 0 1 1 2 2 2 2 3 3 ...