Load the dataset

Load and Prepare the Data

# Load the hotel bookings dataset.
# The CSV location can be overridden with the HOTEL_BOOKINGS_CSV environment
# variable so the script is not tied to one machine; when the variable is
# unset, the original lab path is used.
hotel_data_path <- Sys.getenv(
  "HOTEL_BOOKINGS_CSV",
  unset = "G:/semester_1/4_Statistics_R/syllabus/lab/week11/hotel_bookings.csv"
)
# Fail early with an explicit message instead of read.csv's generic
# "cannot open file" error.
if (!file.exists(hotel_data_path)) {
  stop("Hotel bookings CSV not found: ", hotel_data_path, call. = FALSE)
}
hotel_data <- read.csv(hotel_data_path)


# Keep only the response (required_car_parking_spaces) and the five
# explanatory columns that appear in the linear model fitted on hotel_data
subset_data <- hotel_data[c(
  "required_car_parking_spaces",
  "lead_time",
  "adults",
  "children",
  "stays_in_weekend_nights",
  "stays_in_week_nights"
)]

Data Preparation and Exploration

# Count the missing values in each column.
# vapply() guarantees a named integer vector whatever the input looks like,
# whereas sapply()'s return type depends on the data.
missing_values <- vapply(hotel_data, function(x) sum(is.na(x)), integer(1))

# Drop every row that has at least one missing value.
# complete.cases() is the vectorised equivalent of testing each row with
# apply(is.na(.), 1, any) and avoids a row-by-row loop over the data.
hotel_data <- hotel_data %>% filter(complete.cases(.))

Prepare Categorical Variables

# Columns that hold categories rather than quantities; convert them to factors
categorical_vars <- c(
  "hotel", "meal", "country", "market_segment", "distribution_channel",
  "reserved_room_type", "assigned_room_type", "deposit_type",
  "customer_type", "reservation_status"
)
hotel_data[categorical_vars] <- lapply(hotel_data[categorical_vars], as.factor)

# Encode the arrival month as a factor whose levels follow calendar order.
# Base R's `month.name` is the vector of English month names
# ("January" ... "December"), so the levels need not be typed out by hand.
hotel_data$arrival_date_month <- factor(
  hotel_data$arrival_date_month,
  levels = month.name
)

Data Summary

# Per-column summary of the cleaned data: quantiles for numeric columns,
# level counts for factors
print(summary(hotel_data))
##           hotel        is_canceled       lead_time   arrival_date_year
##  City Hotel  :79326   Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Resort Hotel:40060   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##                       Median :0.0000   Median : 69   Median :2016     
##                       Mean   :0.3704   Mean   :104   Mean   :2016     
##                       3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                       Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                       
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  August :13873      Min.   : 1.00            Min.   : 1.0             
##  July   :12661      1st Qu.:16.00            1st Qu.: 8.0             
##  May    :11791      Median :28.00            Median :16.0             
##  October:11160      Mean   :27.16            Mean   :15.8             
##  April  :11089      3rd Qu.:38.00            3rd Qu.:23.0             
##  June   :10939      Max.   :53.00            Max.   :31.0             
##  (Other):47873                                                        
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies                 meal          country     
##  Min.   : 0.0000   Min.   : 0.000000   BB       :92306   PRT    :48586  
##  1st Qu.: 0.0000   1st Qu.: 0.000000   FB       :  798   GBR    :12129  
##  Median : 0.0000   Median : 0.000000   HB       :14463   FRA    :10415  
##  Mean   : 0.1039   Mean   : 0.007949   SC       :10650   ESP    : 8568  
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000   Undefined: 1169   DEU    : 7287  
##  Max.   :10.0000   Max.   :10.000000                     ITA    : 3766  
##                                                          (Other):28635  
##        market_segment  distribution_channel is_repeated_guest
##  Aviation     :  237   Corporate: 6677      Min.   :0.00000  
##  Complementary:  743   Direct   :14645      1st Qu.:0.00000  
##  Corporate    : 5295   GDS      :  193      Median :0.00000  
##  Direct       :12605   TA/TO    :97870      Mean   :0.03191  
##  Groups       :19811   Undefined:    1      3rd Qu.:0.00000  
##  Offline TA/TO:24219                        Max.   :1.00000  
##  Online TA    :56476                                         
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                A      :85994     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                D      :19201     
##  Median : 0.00000       Median : 0.0000                E      : 6535     
##  Mean   : 0.08712       Mean   : 0.1371                F      : 2897     
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                G      : 2094     
##  Max.   :26.00000       Max.   :72.0000                B      : 1114     
##                                                        (Other): 1551     
##  assigned_room_type booking_changes       deposit_type       agent          
##  A      :74053      Min.   : 0.0000   No Deposit:104637   Length:119386     
##  D      :25322      1st Qu.: 0.0000   Non Refund: 14587   Class :character  
##  E      : 7806      Median : 0.0000   Refundable:   162   Mode  :character  
##  F      : 3751      Mean   : 0.2211                                         
##  G      : 2553      3rd Qu.: 0.0000                                         
##  C      : 2375      Max.   :21.0000                                         
##  (Other): 3526                                                              
##    company          days_in_waiting_list         customer_type  
##  Length:119386      Min.   :  0.000      Contract       : 4076  
##  Class :character   1st Qu.:  0.000      Group          :  577  
##  Mode  :character   Median :  0.000      Transient      :89613  
##                     Mean   :  2.321      Transient-Party:25120  
##                     3rd Qu.:  0.000                             
##                     Max.   :391.000                             
##                                                                 
##       adr          required_car_parking_spaces total_of_special_requests
##  Min.   :  -6.38   Min.   :0.00000             Min.   :0.0000           
##  1st Qu.:  69.29   1st Qu.:0.00000             1st Qu.:0.0000           
##  Median :  94.59   Median :0.00000             Median :0.0000           
##  Mean   : 101.83   Mean   :0.06252             Mean   :0.5713           
##  3rd Qu.: 126.00   3rd Qu.:0.00000             3rd Qu.:1.0000           
##  Max.   :5400.00   Max.   :8.00000             Max.   :5.0000           
##                                                                         
##  reservation_status reservation_status_date
##  Canceled :43013    Length:119386          
##  Check-Out:75166    Class :character       
##  No-Show  : 1207    Mode  :character       
##                                            
##                                            
##                                            
## 
# Show the first six rows of the column subset
head(subset_data, n = 6L)
##   required_car_parking_spaces lead_time adults children stays_in_weekend_nights
## 1                           0       342      2        0                       0
## 2                           0       737      2        0                       0
## 3                           0         7      1        0                       0
## 4                           0        13      1        0                       0
## 5                           0        14      2        0                       0
## 6                           0        14      2        0                       0
##   stays_in_week_nights
## 1                    0
## 2                    0
## 3                    1
## 4                    1
## 5                    2
## 6                    2

Model Building and Analysis

# Fit an ordinary least squares model with the number of required parking
# spaces as the response and lead time, party composition (adults, children)
# and length of stay (weekend / week nights) as predictors
model <- lm(
  formula = required_car_parking_spaces ~ lead_time + adults + children +
    stays_in_weekend_nights + stays_in_week_nights,
  data = hotel_data
)

Model Summary

# Coefficient table, residual quantiles and overall fit statistics
print(summary(model))
## 
## Call:
## lm(formula = required_car_parking_spaces ~ lead_time + adults + 
##     children + stays_in_weekend_nights + stays_in_week_nights, 
##     data = hotel_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6388 -0.0801 -0.0667 -0.0367  7.9507 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              6.861e-02  2.493e-03  27.527  < 2e-16 ***
## lead_time               -2.666e-04  6.724e-06 -39.648  < 2e-16 ***
## adults                   1.211e-02  1.229e-03   9.846  < 2e-16 ***
## children                 3.184e-02  1.771e-03  17.982  < 2e-16 ***
## stays_in_weekend_nights -2.727e-03  8.147e-04  -3.347 0.000816 ***
## stays_in_week_nights    -6.439e-04  4.305e-04  -1.496 0.134752    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2432 on 119380 degrees of freedom
## Multiple R-squared:  0.01719,    Adjusted R-squared:  0.01715 
## F-statistic: 417.6 on 5 and 119380 DF,  p-value: < 2.2e-16

Diagnostic Tools

# Draw the lm diagnostic plots in a 2x2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later plots in the same session are not forced into the 2x2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

Residual Analysis

# Anderson-Darling normality test (ad.test from the nortest package) applied
# to the residuals of the fitted linear model
library(nortest)

ad_test_result <- ad.test(x = model$residuals)
print(ad_test_result)
## 
##  Anderson-Darling normality test
## 
## data:  model$residuals
## A = 31076, p-value < 2.2e-16

Kolmogorov-Smirnov Test

# Kolmogorov-Smirnov test of the residuals against a normal distribution.
# ks.test(x, "pnorm") with no further arguments compares against the standard
# normal N(0, 1). The residuals are on a different scale, so the statistic
# would mostly measure that scale mismatch rather than the shape. The mean and
# sd of the residuals are therefore passed to pnorm explicitly. Because they
# are estimated from the same data, the p-value is only approximate.
# NOTE(review): the recorded output that follows (D = 0.43751) was produced by
# the unparameterised call and must be regenerated.
ks_test_result <- ks.test(
  model$residuals,
  "pnorm",
  mean = mean(model$residuals),
  sd = sd(model$residuals)
)
## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test
# Display the stored Kolmogorov-Smirnov result
print(ks_test_result)
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model$residuals
## D = 0.43751, p-value < 2.2e-16
## alternative hypothesis: two-sided

Highlighting Issues

# Report the Anderson-Darling outcome when it rejects normality at the 5% level
if (ad_test_result[["p.value"]] < 0.05) {
  cat(
    "The Anderson-Darling test suggests that the residuals may not follow a normal distribution.\n"
  )
}
## The Anderson-Darling test suggests that the residuals may not follow a normal distribution.
# Report the Kolmogorov-Smirnov outcome when it rejects normality at the 5%
# level
if (ks_test_result[["p.value"]] < 0.05) {
  cat(
    "The Kolmogorov-Smirnov test indicates potential departures from normality in the residuals.\n"
  )
}
## The Kolmogorov-Smirnov test indicates potential departures from normality in the residuals.

Interpret a Coefficient

# Interpret the slope for `adults` from the fitted model.
coefficient_to_interpret <- coef(model)["adults"]
# Report three significant digits: the estimate is about 0.0121, so rounding
# to two decimals would print 0.01 and lose a sizeable part of the effect.
# sep = "" stops cat() from adding a second space on either side of the number.
cat(
  "For every additional adult staying in a room, the expected number of ",
  "required parking spaces increases by ",
  format(signif(coefficient_to_interpret, 3)),
  " on average, holding other variables constant.\n",
  sep = ""
)
## For every additional adult staying in a room, the expected number of required parking spaces increases by 0.0121 on average, holding other variables constant.
# Kolmogorov-Smirnov test of the residuals against a normal distribution.
# Without explicit parameters "pnorm" means the standard normal N(0, 1), which
# does not match the scale of the residuals. The mean and sd of the residuals
# are passed so the test compares against a normal on the same scale. They are
# estimated from the data, so the p-value is approximate.
# NOTE(review): the recorded output that follows (D = 0.43751) was produced by
# the unparameterised call and must be regenerated.
ks.test(
  model$residuals,
  "pnorm",
  mean = mean(model$residuals),
  sd = sd(model$residuals)
)
## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model$residuals
## D = 0.43751, p-value < 2.2e-16
## alternative hypothesis: two-sided

Further Questions and Considerations