Load the dataset
Load and Prepare the Data
# Read the hotel bookings CSV into a data frame
hotel_data <- read.csv(
  file = "G:/semester_1/4_Statistics_R/syllabus/lab/week11/hotel_bookings.csv"
)

# Keep only the response (required_car_parking_spaces) and the five
# predictors that appear in the linear model formula
subset_data <- select(
  hotel_data,
  required_car_parking_spaces,
  lead_time,
  adults,
  children,
  stays_in_weekend_nights,
  stays_in_week_nights
)
Data Preparation and Exploration
# Count missing (NA) values in every column. vapply() pins the result to one
# integer per column, so its shape cannot silently change the way sapply()'s
# can on unusual input.
# NOTE(review): only true NA values are counted. agent and company appear to
# be read as character columns, so placeholder strings (if any) would not be
# detected here -- confirm against the raw file.
missing_values <- vapply(hotel_data, function(x) sum(is.na(x)), integer(1))

# Drop every row that has an NA in any column. complete.cases() is the
# vectorised equivalent of calling any() on each row via apply(), and avoids
# one R-level function call per row.
hotel_data <- hotel_data %>%
  filter(complete.cases(.))
Prepare Categorical Variables
# Columns that hold category labels and should be stored as factors
categorical_vars <- c(
  "hotel", "meal", "country", "market_segment", "distribution_channel",
  "reserved_room_type", "assigned_room_type", "deposit_type",
  "customer_type", "reservation_status"
)
hotel_data[categorical_vars] <- lapply(
  hotel_data[categorical_vars],
  as.factor
)

# Arrival month as a factor whose levels follow calendar order;
# month.name is base R's built-in vector "January", ..., "December"
hotel_data$arrival_date_month <- factor(
  hotel_data$arrival_date_month,
  levels = month.name
)
Data Summary
# Per-column summary of the cleaned data: level counts for factor columns,
# quantiles and mean for numeric columns, length/class for character columns
summary(hotel_data)
## hotel is_canceled lead_time arrival_date_year
## City Hotel :79326 Min. :0.0000 Min. : 0 Min. :2015
## Resort Hotel:40060 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## August :13873 Min. : 1.00 Min. : 1.0
## July :12661 1st Qu.:16.00 1st Qu.: 8.0
## May :11791 Median :28.00 Median :16.0
## October:11160 Mean :27.16 Mean :15.8
## April :11089 3rd Qu.:38.00 3rd Qu.:23.0
## June :10939 Max. :53.00 Max. :31.0
## (Other):47873
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 BB :92306 PRT :48586
## 1st Qu.: 0.0000 1st Qu.: 0.000000 FB : 798 GBR :12129
## Median : 0.0000 Median : 0.000000 HB :14463 FRA :10415
## Mean : 0.1039 Mean : 0.007949 SC :10650 ESP : 8568
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000 Undefined: 1169 DEU : 7287
## Max. :10.0000 Max. :10.000000 ITA : 3766
## (Other):28635
## market_segment distribution_channel is_repeated_guest
## Aviation : 237 Corporate: 6677 Min. :0.00000
## Complementary: 743 Direct :14645 1st Qu.:0.00000
## Corporate : 5295 GDS : 193 Median :0.00000
## Direct :12605 TA/TO :97870 Mean :0.03191
## Groups :19811 Undefined: 1 3rd Qu.:0.00000
## Offline TA/TO:24219 Max. :1.00000
## Online TA :56476
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 A :85994
## 1st Qu.: 0.00000 1st Qu.: 0.0000 D :19201
## Median : 0.00000 Median : 0.0000 E : 6535
## Mean : 0.08712 Mean : 0.1371 F : 2897
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 G : 2094
## Max. :26.00000 Max. :72.0000 B : 1114
## (Other): 1551
## assigned_room_type booking_changes deposit_type agent
## A :74053 Min. : 0.0000 No Deposit:104637 Length:119386
## D :25322 1st Qu.: 0.0000 Non Refund: 14587 Class :character
## E : 7806 Median : 0.0000 Refundable: 162 Mode :character
## F : 3751 Mean : 0.2211
## G : 2553 3rd Qu.: 0.0000
## C : 2375 Max. :21.0000
## (Other): 3526
## company days_in_waiting_list customer_type
## Length:119386 Min. : 0.000 Contract : 4076
## Class :character 1st Qu.: 0.000 Group : 577
## Mode :character Median : 0.000 Transient :89613
## Mean : 2.321 Transient-Party:25120
## 3rd Qu.: 0.000
## Max. :391.000
##
## adr required_car_parking_spaces total_of_special_requests
## Min. : -6.38 Min. :0.00000 Min. :0.0000
## 1st Qu.: 69.29 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 94.59 Median :0.00000 Median :0.0000
## Mean : 101.83 Mean :0.06252 Mean :0.5713
## 3rd Qu.: 126.00 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :5400.00 Max. :8.00000 Max. :5.0000
##
## reservation_status reservation_status_date
## Canceled :43013 Length:119386
## Check-Out:75166 Class :character
## No-Show : 1207 Mode :character
##
##
##
##
# Preview the first rows of the six-column subset (response plus predictors).
# Note: subset_data was taken from hotel_data before the rows containing NA
# were removed, so it is not row-for-row identical to the cleaned hotel_data.
head(subset_data)
## required_car_parking_spaces lead_time adults children stays_in_weekend_nights
## 1 0 342 2 0 0
## 2 0 737 2 0 0
## 3 0 7 1 0 0
## 4 0 13 1 0 0
## 5 0 14 2 0 0
## 6 0 14 2 0 0
## stays_in_week_nights
## 1 0
## 2 0
## 3 1
## 4 1
## 5 2
## 6 2
Model Building and Analysis
# Fit an ordinary least-squares model with the number of required parking
# spaces as the response and lead time, party composition and length of stay
# as predictors
model <- lm(
  formula = required_car_parking_spaces ~
    lead_time + adults + children +
    stays_in_weekend_nights + stays_in_week_nights,
  data = hotel_data
)
Model Summary
# Print residual quantiles, the coefficient table, R-squared and the overall
# F-test for the fitted model
summary(model)
##
## Call:
## lm(formula = required_car_parking_spaces ~ lead_time + adults +
## children + stays_in_weekend_nights + stays_in_week_nights,
## data = hotel_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6388 -0.0801 -0.0667 -0.0367 7.9507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.861e-02 2.493e-03 27.527 < 2e-16 ***
## lead_time -2.666e-04 6.724e-06 -39.648 < 2e-16 ***
## adults 1.211e-02 1.229e-03 9.846 < 2e-16 ***
## children 3.184e-02 1.771e-03 17.982 < 2e-16 ***
## stays_in_weekend_nights -2.727e-03 8.147e-04 -3.347 0.000816 ***
## stays_in_week_nights -6.439e-04 4.305e-04 -1.496 0.134752
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2432 on 119380 degrees of freedom
## Multiple R-squared: 0.01719, Adjusted R-squared: 0.01715
## F-statistic: 417.6 on 5 and 119380 DF, p-value: < 2.2e-16
Residual Analysis
# Anderson-Darling normality test on the model residuals; ad.test() is
# provided by the nortest package loaded on the next line
library(nortest)
ad_test_result <- ad.test(model$residuals)
# Print the stored test result (statistic A and p-value)
ad_test_result
##
## Anderson-Darling normality test
##
## data: model$residuals
## A = 31076, p-value < 2.2e-16
Kolmogorov-Smirnov Test
# Kolmogorov-Smirnov test of the residuals against a normal distribution.
# ks.test() forwards extra arguments to the named distribution function, and
# pnorm() defaults to mean = 0, sd = 1. Without them the residuals are compared
# with a *standard* normal, so the test rejects largely because the residual
# spread (residual standard error ~0.24) is not 1. Passing the residuals' own
# mean and standard deviation makes the test about the shape of the
# distribution instead.
# Because these parameters are estimated from the same data, the reported
# p-value is only approximate (it tends to be conservative).
ks_test_result <- ks.test(
  model$residuals,
  "pnorm",
  mean = mean(model$residuals),
  sd = sd(model$residuals)
)
## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test
# Print the stored Kolmogorov-Smirnov result (statistic D and p-value)
ks_test_result
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: model$residuals
## D = 0.43751, p-value < 2.2e-16
## alternative hypothesis: two-sided
Highlighting Issues
# Report a departure from normality when the Anderson-Darling p-value is
# below the 5% significance level
if (ad_test_result$p.value < 0.05) {
cat("The Anderson-Darling test suggests that the residuals may not follow a normal distribution.\n")
}
## The Anderson-Darling test suggests that the residuals may not follow a normal distribution.
# Report a departure from normality when the Kolmogorov-Smirnov p-value is
# below the 5% significance level
if (ks_test_result$p.value < 0.05) {
cat("The Kolmogorov-Smirnov test indicates potential departures from normality in the residuals.\n")
}
## The Kolmogorov-Smirnov test indicates potential departures from normality in the residuals.
Interpret a Coefficient
# Interpret the slope for 'adults': the expected change in required parking
# spaces per additional adult, with the other predictors held fixed
coefficient_to_interpret <- coef(model)['adults']
# signif() keeps three significant digits, so a small slope is not flattened
# to 0.01 (or to 0) as round(x, 2) would do. sep = "" stops cat() from adding
# its default separator, which doubled the spaces around the number.
cat(
  "For every additional adult staying in a room, the expected number of required parking spaces increases by ",
  signif(coefficient_to_interpret, 3),
  " on average, holding other variables constant.\n",
  sep = ""
)
## For every additional adult staying in a room, the expected number of required parking spaces increases by 0.01 on average, holding other variables constant.
# Kolmogorov-Smirnov test of the residuals against a normal distribution
# (this repeats the test already stored in ks_test_result and could be
# dropped). pnorm() defaults to mean = 0, sd = 1, so the residuals' own mean
# and standard deviation are passed explicitly; otherwise the comparison is
# against a standard normal and mostly reflects the residual scale.
# The parameters are estimated from the same data, so the p-value is only
# approximate.
ks.test(
  model$residuals,
  "pnorm",
  mean = mean(model$residuals),
  sd = sd(model$residuals)
)
## Warning in ks.test.default(model$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: model$residuals
## D = 0.43751, p-value < 2.2e-16
## alternative hypothesis: two-sided
Further Questions and Considerations
- The diagnostic tests indicate that the residuals deviate from
normality, so the model’s assumptions may be compromised. Consider
further investigation and a possible transformation of the data.
- The residual plots show heteroscedasticity, suggesting that the
variance of residuals may vary with predictor variables. This may affect
the reliability of the model.
- Investigate influential outliers that might be contributing to the
observed issues.
- Explore alternative modeling techniques that can handle non-normal
residuals, such as robust regression or non-parametric models.