Loading the dataset

The third data set is Hotel booking demand from Kaggle. https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand/code

library(tidyverse)
library(lubridate)
hotel <- read_csv("~/data_607_project_2/hotel_bookings.csv")

glimpse(hotel)
## Rows: 119,390
## Columns: 32
## $ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type                  <chr> "Transient", "Transient", "Transient", …
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
summary(hotel)
##     hotel            is_canceled       lead_time   arrival_date_year
##  Length:119390      Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Class :character   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##  Mode  :character   Median :0.0000   Median : 69   Median :2016     
##                     Mean   :0.3704   Mean   :104   Mean   :2016     
##                     3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                     Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                     
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  Length:119390      Min.   : 1.00            Min.   : 1.0             
##  Class :character   1st Qu.:16.00            1st Qu.: 8.0             
##  Mode  :character   Median :28.00            Median :16.0             
##                     Mean   :27.17            Mean   :15.8             
##                     3rd Qu.:38.00            3rd Qu.:23.0             
##                     Max.   :53.00            Max.   :31.0             
##                                                                       
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies              meal             country         
##  Min.   : 0.0000   Min.   : 0.000000   Length:119390      Length:119390     
##  1st Qu.: 0.0000   1st Qu.: 0.000000   Class :character   Class :character  
##  Median : 0.0000   Median : 0.000000   Mode  :character   Mode  :character  
##  Mean   : 0.1039   Mean   : 0.007949                                        
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000                                        
##  Max.   :10.0000   Max.   :10.000000                                        
##  NA's   :4                                                                  
##  market_segment     distribution_channel is_repeated_guest
##  Length:119390      Length:119390        Min.   :0.00000  
##  Class :character   Class :character     1st Qu.:0.00000  
##  Mode  :character   Mode  :character     Median :0.00000  
##                                          Mean   :0.03191  
##                                          3rd Qu.:0.00000  
##                                          Max.   :1.00000  
##                                                           
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                Length:119390     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                Class :character  
##  Median : 0.00000       Median : 0.0000                Mode  :character  
##  Mean   : 0.08712       Mean   : 0.1371                                  
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                                  
##  Max.   :26.00000       Max.   :72.0000                                  
##                                                                          
##  assigned_room_type booking_changes   deposit_type          agent          
##  Length:119390      Min.   : 0.0000   Length:119390      Length:119390     
##  Class :character   1st Qu.: 0.0000   Class :character   Class :character  
##  Mode  :character   Median : 0.0000   Mode  :character   Mode  :character  
##                     Mean   : 0.2211                                        
##                     3rd Qu.: 0.0000                                        
##                     Max.   :21.0000                                        
##                                                                            
##    company          days_in_waiting_list customer_type           adr         
##  Length:119390      Min.   :  0.000      Length:119390      Min.   :  -6.38  
##  Class :character   1st Qu.:  0.000      Class :character   1st Qu.:  69.29  
##  Mode  :character   Median :  0.000      Mode  :character   Median :  94.58  
##                     Mean   :  2.321                         Mean   : 101.83  
##                     3rd Qu.:  0.000                         3rd Qu.: 126.00  
##                     Max.   :391.000                         Max.   :5400.00  
##                                                                              
##  required_car_parking_spaces total_of_special_requests reservation_status
##  Min.   :0.00000             Min.   :0.0000            Length:119390     
##  1st Qu.:0.00000             1st Qu.:0.0000            Class :character  
##  Median :0.00000             Median :0.0000            Mode  :character  
##  Mean   :0.06252             Mean   :0.5714                              
##  3rd Qu.:0.00000             3rd Qu.:1.0000                              
##  Max.   :8.00000             Max.   :5.0000                              
##                                                                          
##  reservation_status_date
##  Min.   :2014-10-17     
##  1st Qu.:2016-02-01     
##  Median :2016-08-07     
##  Mean   :2016-07-30     
##  3rd Qu.:2017-02-08     
##  Max.   :2017-09-14     
## 

Duplicated and missing value in this dataset

31994 rows are duplicated and there are 4 missing value in children column.

# Find out number of missing missing value
# Find out number of duplicated value

hotel |> 
  summarise(across(everything(), ~ sum(is.na(.)))) |>
  t()
##                                [,1]
## hotel                             0
## is_canceled                       0
## lead_time                         0
## arrival_date_year                 0
## arrival_date_month                0
## arrival_date_week_number          0
## arrival_date_day_of_month         0
## stays_in_weekend_nights           0
## stays_in_week_nights              0
## adults                            0
## children                          4
## babies                            0
## meal                              0
## country                           0
## market_segment                    0
## distribution_channel              0
## is_repeated_guest                 0
## previous_cancellations            0
## previous_bookings_not_canceled    0
## reserved_room_type                0
## assigned_room_type                0
## booking_changes                   0
## deposit_type                      0
## agent                             0
## company                           0
## days_in_waiting_list              0
## customer_type                     0
## adr                               0
## required_car_parking_spaces       0
## total_of_special_requests         0
## reservation_status                0
## reservation_status_date           0
nrow(hotel[duplicated(hotel),])
## [1] 31994

Data cleaning and transformation

# drop rows contains NA value
# drop duplicates rows
# remove column: 
# transform arrival_date_month into numeric value
# merge these three column in to arrival_date
# drop the column arrival_date_week_number
# Need at least 1 adult to reserve a hotel. Any rows where number of adults equal to 0 need to be eliminated.
# Merge children and babies into new column 'kids'
# drop the column children, babies
# Merge stays_in_weekend_nights, stays_in_week_nights into new column 'total_nights'

hotel2 <- hotel |> 
              drop_na() |> 
              distinct() |> 
              select(-c(agent, company, adr)) |> 
              mutate(arrival_date_month = match(tolower(arrival_date_month), tolower(month.name))) |> 
              mutate(arrival_date = as.Date(paste(arrival_date_year, arrival_date_month, arrival_date_day_of_month, sep = "-"))) |> 
              select(-arrival_date_week_number) |> 
              filter(adults != 0) |> 
              mutate(kids = children + babies) |>
              select(-c(children, babies)) |> 
              relocate(kids, .after = adults) |> 
              mutate(total_nights = stays_in_weekend_nights + stays_in_week_nights) |>
              select(-c(stays_in_weekend_nights, stays_in_week_nights)) |> 
              relocate(total_nights, .after = arrival_date_day_of_month)

hotel2
## # A tibble: 87,007 × 27
##    hotel        is_canceled lead_time arrival_date_year arrival_date_month
##    <chr>              <dbl>     <dbl>             <dbl>              <int>
##  1 Resort Hotel           0       342              2015                  7
##  2 Resort Hotel           0       737              2015                  7
##  3 Resort Hotel           0         7              2015                  7
##  4 Resort Hotel           0        13              2015                  7
##  5 Resort Hotel           0        14              2015                  7
##  6 Resort Hotel           0         0              2015                  7
##  7 Resort Hotel           0         9              2015                  7
##  8 Resort Hotel           1        85              2015                  7
##  9 Resort Hotel           1        75              2015                  7
## 10 Resort Hotel           1        23              2015                  7
## # ℹ 86,997 more rows
## # ℹ 22 more variables: arrival_date_day_of_month <dbl>, total_nights <dbl>,
## #   adults <dbl>, kids <dbl>, meal <chr>, country <chr>, market_segment <chr>,
## #   distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, days_in_waiting_list <dbl>, customer_type <chr>, …

Visualize the data

# Compare Which Hotel Type is more popular
hotel2 |>
  group_by(hotel) |> 
  summarise(count = n()) |> 
  ggplot(aes(x = hotel, y = count)) +
  geom_bar(stat = "identity") + 
  geom_text(aes(label = count), vjust = -0.5) +
  labs(x = "Hotel Type", y = "Number of Reservations")

# How many percentage of reservations got cancelled?

hotel2 |>
  group_by(hotel, is_canceled) |> 
  summarise(count = n()) |> 
  mutate(percentage = count / sum(count)) |> 
  ggplot(aes(x = hotel, y = percentage, fill = ifelse(is_canceled == 1, "canceled", "not canceled"))) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = scales::percent(percentage)), position = position_dodge(width = 0.9), vjust = -0.5) +
  labs(title = "Percentage of Canceled Reservations By Hotel", x = "Hotel", y = "Percentage", fill="Type")+
  scale_y_continuous(labels = scales::percent)

# Top 10 Countries of Origin for Hotel Visitors

hotel2 |>
  group_by(country) |> 
  summarise(count = n()) |> 
  top_n(10) |> 
  ggplot(aes(x = reorder(country, count), y = count, fill = country)) +
  geom_bar(stat = "identity") + 
  coord_flip() +
  labs(title = "Top 10 Countries of Origin for Hotel Visitors",
       x = "Country",
       y = "Number of Guests")