Load the dataset

Load and Prepare the Data

# Read the hotel bookings CSV into a data frame
hotel_data <- read.csv(
  file = "G:/semester_1/4_Statistics_R/syllabus/lab/week11/hotel_bookings.csv"
)


# Keep only the response column and the five predictor columns that the
# linear model further down refers to
subset_data <- select(
  hotel_data,
  required_car_parking_spaces,
  lead_time,
  adults,
  children,
  stays_in_weekend_nights,
  stays_in_week_nights
)

Data Preparation and Exploration

Check for Missing Values

# Count the missing values in each column.
# vapply() pins the result to exactly one integer per column, so the shape of
# missing_values cannot vary with the input the way sapply()'s result can.
missing_values <- vapply(
  hotel_data,
  function(column) sum(is.na(column)),
  integer(1)
)

# Drop every row that has at least one missing value.
# complete.cases() flags the fully observed rows directly, which avoids a
# row-by-row apply() over the whole table.
hotel_data <- hotel_data %>% filter(complete.cases(.))

Prepare Categorical Variables

# Columns to be treated as categorical
categorical_vars <- c(
  "hotel", "meal", "country", "market_segment", "distribution_channel",
  "reserved_room_type", "assigned_room_type", "deposit_type",
  "customer_type", "reservation_status"
)
# Convert each of those columns to a factor in one pass
hotel_data[categorical_vars] <- lapply(
  X = hotel_data[categorical_vars],
  FUN = as.factor
)

# Store the arrival month as a factor whose levels follow calendar order;
# month.name is base R's built-in January-to-December vector of month names
hotel_data$arrival_date_month <- factor(
  x = hotel_data$arrival_date_month,
  levels = month.name
)

Data Summary

# Data summary
# Print per-column summary statistics for hotel_data as it stands after the
# missing-value removal and factor conversion above
summary(hotel_data)

##           hotel        is_canceled       lead_time   arrival_date_year
##  City Hotel  :79326   Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Resort Hotel:40060   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##                       Median :0.0000   Median : 69   Median :2016     
##                       Mean   :0.3704   Mean   :104   Mean   :2016     
##                       3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                       Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                       
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  August :13873      Min.   : 1.00            Min.   : 1.0             
##  July   :12661      1st Qu.:16.00            1st Qu.: 8.0             
##  May    :11791      Median :28.00            Median :16.0             
##  October:11160      Mean   :27.16            Mean   :15.8             
##  April  :11089      3rd Qu.:38.00            3rd Qu.:23.0             
##  June   :10939      Max.   :53.00            Max.   :31.0             
##  (Other):47873                                                        
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies                 meal          country     
##  Min.   : 0.0000   Min.   : 0.000000   BB       :92306   PRT    :48586  
##  1st Qu.: 0.0000   1st Qu.: 0.000000   FB       :  798   GBR    :12129  
##  Median : 0.0000   Median : 0.000000   HB       :14463   FRA    :10415  
##  Mean   : 0.1039   Mean   : 0.007949   SC       :10650   ESP    : 8568  
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000   Undefined: 1169   DEU    : 7287  
##  Max.   :10.0000   Max.   :10.000000                     ITA    : 3766  
##                                                          (Other):28635  
##        market_segment  distribution_channel is_repeated_guest
##  Aviation     :  237   Corporate: 6677      Min.   :0.00000  
##  Complementary:  743   Direct   :14645      1st Qu.:0.00000  
##  Corporate    : 5295   GDS      :  193      Median :0.00000  
##  Direct       :12605   TA/TO    :97870      Mean   :0.03191  
##  Groups       :19811   Undefined:    1      3rd Qu.:0.00000  
##  Offline TA/TO:24219                        Max.   :1.00000  
##  Online TA    :56476                                         
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                A      :85994     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                D      :19201     
##  Median : 0.00000       Median : 0.0000                E      : 6535     
##  Mean   : 0.08712       Mean   : 0.1371                F      : 2897     
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                G      : 2094     
##  Max.   :26.00000       Max.   :72.0000                B      : 1114     
##                                                        (Other): 1551     
##  assigned_room_type booking_changes       deposit_type       agent          
##  A      :74053      Min.   : 0.0000   No Deposit:104637   Length:119386     
##  D      :25322      1st Qu.: 0.0000   Non Refund: 14587   Class :character  
##  E      : 7806      Median : 0.0000   Refundable:   162   Mode  :character  
##  F      : 3751      Mean   : 0.2211                                         
##  G      : 2553      3rd Qu.: 0.0000                                         
##  C      : 2375      Max.   :21.0000                                         
##  (Other): 3526                                                              
##    company          days_in_waiting_list         customer_type  
##  Length:119386      Min.   :  0.000      Contract       : 4076  
##  Class :character   1st Qu.:  0.000      Group          :  577  
##  Mode  :character   Median :  0.000      Transient      :89613  
##                     Mean   :  2.321      Transient-Party:25120  
##                     3rd Qu.:  0.000                             
##                     Max.   :391.000                             
##                                                                 
##       adr          required_car_parking_spaces total_of_special_requests
##  Min.   :  -6.38   Min.   :0.00000             Min.   :0.0000           
##  1st Qu.:  69.29   1st Qu.:0.00000             1st Qu.:0.0000           
##  Median :  94.59   Median :0.00000             Median :0.0000           
##  Mean   : 101.83   Mean   :0.06252             Mean   :0.5713           
##  3rd Qu.: 126.00   3rd Qu.:0.00000             3rd Qu.:1.0000           
##  Max.   :5400.00   Max.   :8.00000             Max.   :5.0000           
##                                                                         
##  reservation_status reservation_status_date
##  Canceled :43013    Length:119386          
##  Check-Out:75166    Class :character       
##  No-Show  : 1207    Mode  :character       
##                                            
##                                            
##                                            
##

# Checking the first few rows of the dataset
# Note: this previews subset_data (the six selected columns), which was taken
# from hotel_data before the rows with missing values were removed
head(subset_data)

##   required_car_parking_spaces lead_time adults children stays_in_weekend_nights
## 1                           0       342      2        0                       0
## 2                           0       737      2        0                       0
## 3                           0         7      1        0                       0
## 4                           0        13      1        0                       0
## 5                           0        14      2        0                       0
## 6                           0        14      2        0                       0
##   stays_in_week_nights
## 1                    0
## 2                    0
## 3                    1
## 4                    1
## 5                    2
## 6                    2

Model Building and Analysis

# Fit an ordinary least-squares model with 'required_car_parking_spaces' as
# the response and lead time, party composition and length of stay as
# predictors
model <- lm(
  formula = required_car_parking_spaces ~
    lead_time + adults + children +
    stays_in_weekend_nights + stays_in_week_nights,
  data = hotel_data
)

Model Summary

# Print the residual quantiles, coefficient table and fit statistics
summary(model)

## 
## Call:
## lm(formula = required_car_parking_spaces ~ lead_time + adults + 
##     children + stays_in_weekend_nights + stays_in_week_nights, 
##     data = hotel_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6388 -0.0801 -0.0667 -0.0367  7.9507 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              6.861e-02  2.493e-03  27.527  < 2e-16 ***
## lead_time               -2.666e-04  6.724e-06 -39.648  < 2e-16 ***
## adults                   1.211e-02  1.229e-03   9.846  < 2e-16 ***
## children                 3.184e-02  1.771e-03  17.982  < 2e-16 ***
## stays_in_weekend_nights -2.727e-03  8.147e-04  -3.347 0.000816 ***
## stays_in_week_nights    -6.439e-04  4.305e-04  -1.496 0.134752    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2432 on 119380 degrees of freedom
## Multiple R-squared:  0.01719,    Adjusted R-squared:  0.01715 
## F-statistic: 417.6 on 5 and 119380 DF,  p-value: < 2.2e-16

Diagnostic Tools

# Lay the plotting device out as a 2x2 grid, then draw the diagnostic plots
# for the fitted model into it
par(mfrow = c(2, 2))
plot(model)

Residual Analysis

# Here I am performing the Anderson-Darling test on residuals
# ad.test() is provided by the nortest package loaded on the next line
library(nortest)

# Run the test on the model residuals and keep the result so that its
# p-value can be checked further down
ad_test_result <- ad.test(model$residuals)
# Print the test result
ad_test_result

## 
##  Anderson-Darling normality test
## 
## data:  model$residuals
## A = 31076, p-value < 2.2e-16

Kolmogorov-Smirnov Test

# Performing the Kolmogorov-Smirnov test on residuals.
# ks.test(x, "pnorm") with no further arguments compares against the standard
# normal N(0, 1), so it would reject merely because the residuals are not on
# unit scale. Pass the residuals' own mean and standard deviation so that the
# test looks at distributional shape instead.
# Caveat: the parameters are estimated from the same data, so the reported
# p-value is only approximate; nortest::lillie.test() is the corrected
# variant of this test.
# Any recorded output shown after this call predates the change and should be
# regenerated.
ks_test_result <- ks.test(
  model$residuals,
  "pnorm",
  mean = mean(model$residuals),
  sd = sd(model$residuals)
)

## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test

# Print the stored Kolmogorov-Smirnov test result
ks_test_result

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model$residuals
## D = 0.43751, p-value < 2.2e-16
## alternative hypothesis: two-sided

Highlighting Issues

# Highlighting Issues
# Report a departure from normality when the Anderson-Darling p-value is
# below the 0.05 threshold; nothing is printed otherwise
if (ad_test_result$p.value < 0.05) {
  cat("The Anderson-Darling test suggests that the residuals may not follow a normal distribution.\n")
}

## The Anderson-Darling test suggests that the residuals may not follow a normal distribution.

# Report a departure from normality when the Kolmogorov-Smirnov p-value is
# below the 0.05 threshold; nothing is printed otherwise
if (ks_test_result$p.value < 0.05) {
  cat("The Kolmogorov-Smirnov test indicates potential departures from normality in the residuals.\n")
}

## The Kolmogorov-Smirnov test indicates potential departures from normality in the residuals.

Interpret a Coefficient

# Pull out the fitted slope for 'adults'
coefficient_to_interpret <- coef(model)["adults"]
# Report it to three significant digits: the estimate is about 0.012, so
# rounding to two decimal places would print 0.01 and hide part of the effect.
# cat() already separates its arguments with a space, so the string pieces
# carry no padding of their own (otherwise the number is double-spaced).
cat(
  "For every additional adult staying in a room, the expected number of required parking spaces increases by",
  signif(coefficient_to_interpret, 3),
  "on average, holding other variables constant.\n"
)

## For every additional adult staying in a room, the expected number of required parking spaces increases by 0.0121 on average, holding other variables constant.

# Performing the Kolmogorov-Smirnov test on residuals.
# ks.test(x, "pnorm") with no further arguments compares against the standard
# normal N(0, 1); supplying the residuals' own mean and standard deviation
# keeps the test from rejecting on scale alone.
# Caveat: the parameters are estimated from the same data, so the reported
# p-value is only approximate.
# Any recorded output shown after this call predates the change and should be
# regenerated.
ks.test(
  model$residuals,
  "pnorm",
  mean = mean(model$residuals),
  sd = sd(model$residuals)
)

## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  model$residuals
## D = 0.43751, p-value < 2.2e-16
## alternative hypothesis: two-sided

Further Questions and Considerations

The diagnostic tests indicate that the residuals deviate from normality, so the model's assumptions may be compromised. Consider further investigation and a possible transformation of the data.
The residual plots show heteroscedasticity, suggesting that the variance of residuals may vary with predictor variables. This may affect the reliability of the model.
Investigate influential outliers that might be contributing to the observed issues.
Explore alternative modeling techniques that can handle non-normal residuals, such as robust regression or non-parametric models.

Hotel Booking Analysis

Dhruv Raghav

Date: November 06, 2023

Load the dataset

Load and Prepare the Data

Data Preparation and Exploration

Prepare Categorical Variables

Data Summary

Model Building and Analysis

Model Summary

Diagnostic Tools

Residual Analysis

Kolmogorov-Smirnov Test

Highlighting Issues

Interpret a Coefficient

Further Questions and Considerations