knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(kknn)
library(dplyr)
library(tidyr)
library(stringr)
library(tidyverse)
library(corrplot)
library(DescTools)
#import data
hotels_data <- read.csv("hotels.csv")
#Review variables - 119390 obs of 32 variables
str(hotels_data)
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
# Checking for values that don't make sense
summary(hotels_data)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Length:119390
## Class :character
## Mode :character
##
##
##
##
# Find all numeric variables
numeric_data <- hotels_data[sapply(hotels_data, is.numeric)]
numeric_data_melted <- reshape2::melt(numeric_data)
## No id variables; using all as measure variables
# Create a boxplot for numeric variables
ggplot(numeric_data_melted, aes(y=value)) +
geom_boxplot() +
facet_wrap(~variable, scales = 'free', ncol=2) +
labs(x='', y='')
## Warning: Removed 4 rows containing non-finite values (`stat_boxplot()`).
adrSetting the outlier 5400 to the mean of adr
summary(hotels_data$adr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.38 69.29 94.58 101.83 126.00 5400.00
hotels_data[hotels_data$adr > 500, ] <- mean(hotels_data$adr)
hotels_data[hotels_data$adr > 500, ]
## [1] hotel is_canceled
## [3] lead_time arrival_date_year
## [5] arrival_date_month arrival_date_week_number
## [7] arrival_date_day_of_month stays_in_weekend_nights
## [9] stays_in_week_nights adults
## [11] children babies
## [13] meal country
## [15] market_segment distribution_channel
## [17] is_repeated_guest previous_cancellations
## [19] previous_bookings_not_canceled reserved_room_type
## [21] assigned_room_type booking_changes
## [23] deposit_type agent
## [25] company days_in_waiting_list
## [27] customer_type adr
## [29] required_car_parking_spaces total_of_special_requests
## [31] reservation_status reservation_status_date
## <0 rows> (or 0-length row.names)
arrival_date_yearFound arrival year date min = 0 leading to all var in these obs as 0. should remove
summary(hotels_data$arrival_date_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 101.8 2016.0 2016.0 2016.1 2017.0 2017.0
unique(hotels_data$arrival_date_year)
## [1] 2015.0000 2016.0000 2017.0000 101.8311
hotels_data[hotels_data$arrival_date_year == 0, ]
## [1] hotel is_canceled
## [3] lead_time arrival_date_year
## [5] arrival_date_month arrival_date_week_number
## [7] arrival_date_day_of_month stays_in_weekend_nights
## [9] stays_in_week_nights adults
## [11] children babies
## [13] meal country
## [15] market_segment distribution_channel
## [17] is_repeated_guest previous_cancellations
## [19] previous_bookings_not_canceled reserved_room_type
## [21] assigned_room_type booking_changes
## [23] deposit_type agent
## [25] company days_in_waiting_list
## [27] customer_type adr
## [29] required_car_parking_spaces total_of_special_requests
## [31] reservation_status reservation_status_date
## <0 rows> (or 0-length row.names)
# Keep the rows where arrival_date_year is not 0
hotels_data <- hotels_data[hotels_data$arrival_date_year != 0, ]
# Verification step
summary(hotels_data$arrival_date_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 101.8 2016.0 2016.0 2016.1 2017.0 2017.0
unique(hotels_data$arrival_date_year)
## [1] 2015.0000 2016.0000 2017.0000 101.8311
adultssummary(hotels_data$adults)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 2.000 1.859 2.000 101.831
unique(hotels_data$adults)
## [1] 2.0000 1.0000 3.0000 4.0000 40.0000 26.0000 50.0000 27.0000
## [9] 55.0000 0.0000 20.0000 6.0000 5.0000 10.0000 101.8311
#Adults summary values seems off with a max number of 55
unique(hotels_data$adults)
## [1] 2.0000 1.0000 3.0000 4.0000 40.0000 26.0000 50.0000 27.0000
## [9] 55.0000 0.0000 20.0000 6.0000 5.0000 10.0000 101.8311
# All of the obs with adults > 4 is cancelled. These obs will be removed assuming it was a typo error then cancellation to correct
head(hotels_data[hotels_data$adults > 4, "reservation_status"])
## [1] "Canceled" "Canceled" "Canceled" "Canceled" "Canceled" "Canceled"
# Believed the 0 adults counts could be an error, but the reservation status shows check-out and the other variables work so it appears to be an actual booking just missing the guest count
summary(hotels_data[hotels_data$adults == 0,])
## hotel is_canceled lead_time arrival_date_year
## Length:403 Min. :0.0000 Min. : 0.00 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 1.00 1st Qu.:2016
## Mode :character Median :0.0000 Median : 44.00 Median :2016
## Mean :0.2705 Mean : 86.56 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:153.50 3rd Qu.:2017
## Max. :1.0000 Max. :434.00 Max. :2017
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:403 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.:13.00 1st Qu.: 8.50
## Mode :character Median :28.00 Median :16.00
## Mean :27.47 Mean :16.23
## 3rd Qu.:40.50 3rd Qu.:24.00
## Max. :53.00 Max. :31.00
## stays_in_weekend_nights stays_in_week_nights adults children
## Min. : 0.000 Min. : 0.000 Min. :0 Min. :0.000
## 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.:0 1st Qu.:0.000
## Median : 1.000 Median : 2.000 Median :0 Median :2.000
## Mean : 1.119 Mean : 2.871 Mean :0 Mean :1.124
## 3rd Qu.: 2.000 3rd Qu.: 4.000 3rd Qu.:0 3rd Qu.:2.000
## Max. :16.000 Max. :41.000 Max. :0 Max. :3.000
## babies meal country market_segment
## Min. :0.000000 Length:403 Length:403 Length:403
## 1st Qu.:0.000000 Class :character Class :character Class :character
## Median :0.000000 Mode :character Mode :character Mode :character
## Mean :0.007444
## 3rd Qu.:0.000000
## Max. :1.000000
## distribution_channel is_repeated_guest previous_cancellations
## Length:403 Min. :0.000 Min. :0.00000
## Class :character 1st Qu.:0.000 1st Qu.:0.00000
## Mode :character Median :0.000 Median :0.00000
## Mean :0.139 Mean :0.02233
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :1.000 Max. :2.00000
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## Min. :0.00000 Length:403 Length:403
## 1st Qu.:0.00000 Class :character Class :character
## Median :0.00000 Mode :character Mode :character
## Mean :0.06452
## 3rd Qu.:0.00000
## Max. :9.00000
## booking_changes deposit_type agent company
## Min. : 0.0000 Length:403 Length:403 Length:403
## 1st Qu.: 0.0000 Class :character Class :character Class :character
## Median : 0.0000 Mode :character Mode :character Mode :character
## Mean : 0.9876
## 3rd Qu.: 1.0000
## Max. :21.0000
## days_in_waiting_list customer_type adr
## Min. : 0.000 Length:403 Min. : 0.00
## 1st Qu.: 0.000 Class :character 1st Qu.: 0.00
## Median : 0.000 Mode :character Median : 56.27
## Mean : 1.017 Mean : 49.58
## 3rd Qu.: 0.000 3rd Qu.: 91.85
## Max. :122.000 Max. :200.00
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:403
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :1.0000 Mode :character
## Mean :0.02233 Mean :0.7841
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.0000
## reservation_status_date
## Length:403
## Class :character
## Mode :character
##
##
##
hotels_data <- hotels_data[hotels_data$adults < 5, ]
# Verification step
summary(hotels_data$adults)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 2.000 1.853 2.000 4.000
unique(hotels_data$adults)
## [1] 2 1 3 4 0
babiessummary(hotels_data$babies)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.00795 0.00000 10.00000
unique(hotels_data$babies)
## [1] 0 1 2 10 9
# Babies var seems off. There are 9 and 10 counts. This appears to be a one-off error. changing value to 0
unique(hotels_data$babies)
## [1] 0 1 2 10 9
hotels_data[hotels_data$babies == 10, ] <- 0
hotels_data[hotels_data$babies == 9, ] <- 0
unique(hotels_data$babies)
## [1] 0 1 2
# Verification step
summary(hotels_data$babies)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.007791 0.000000 2.000000
unique(hotels_data$babies)
## [1] 0 1 2
Found 4 NA in children and changed them to 0.
# checking for NA
colSums(is.na(hotels_data))
## hotel is_canceled
## 0 0
## lead_time arrival_date_year
## 0 0
## arrival_date_month arrival_date_week_number
## 0 0
## arrival_date_day_of_month stays_in_weekend_nights
## 0 0
## stays_in_week_nights adults
## 0 0
## children babies
## 4 0
## meal country
## 0 0
## market_segment distribution_channel
## 0 0
## is_repeated_guest previous_cancellations
## 0 0
## previous_bookings_not_canceled reserved_room_type
## 0 0
## assigned_room_type booking_changes
## 0 0
## deposit_type agent
## 0 0
## company days_in_waiting_list
## 0 0
## customer_type adr
## 0 0
## required_car_parking_spaces total_of_special_requests
## 0 0
## reservation_status reservation_status_date
## 0 0
# 4 NA found in children. We will assume NA means 0 children
hotels_data[is.na(hotels_data$children),] <- 0
colSums(is.na(hotels_data))
## hotel is_canceled
## 0 0
## lead_time arrival_date_year
## 0 0
## arrival_date_month arrival_date_week_number
## 0 0
## arrival_date_day_of_month stays_in_weekend_nights
## 0 0
## stays_in_week_nights adults
## 0 0
## children babies
## 0 0
## meal country
## 0 0
## market_segment distribution_channel
## 0 0
## is_repeated_guest previous_cancellations
## 0 0
## previous_bookings_not_canceled reserved_room_type
## 0 0
## assigned_room_type booking_changes
## 0 0
## deposit_type agent
## 0 0
## company days_in_waiting_list
## 0 0
## customer_type adr
## 0 0
## required_car_parking_spaces total_of_special_requests
## 0 0
## reservation_status reservation_status_date
## 0 0
corr_matrix <- cor(hotels_data[sapply(hotels_data, is.numeric)])
corrplot(corr_matrix, method="circle", type="upper", order="hclust",
tl.col="black", tl.srt=45)
# Set the seed
set.seed(21)
# Randomly sample row indices for the training set
train_indices <- sample(1:NROW(hotels_data),NROW(hotels_data)*0.75)
# Create the training set
train_data <- hotels_data[train_indices, ]
numeric_train_data <- train_data[sapply(train_data, is.numeric)]
# Create the testing set
test_data <- hotels_data[-train_indices, ]
numeric_test_data <- test_data[sapply(test_data, is.numeric)]
# Winsorize all numeric variables in the dataframe
winsorize_all <- function(data, lower = 0.05, upper = 0.95) {
data <- lapply(data, function(x) Winsorize(x, probs = c(lower, upper)))
return(data)
}
adr - Average Daily Ratehotels_train_data_winsorized <- winsorize_all(numeric_train_data)
summary(hotels_train_data_winsorized)
## Length Class Mode
## is_canceled 89528 -none- numeric
## lead_time 89528 -none- numeric
## arrival_date_year 89528 -none- numeric
## arrival_date_week_number 89528 -none- numeric
## arrival_date_day_of_month 89528 -none- numeric
## stays_in_weekend_nights 89528 -none- numeric
## stays_in_week_nights 89528 -none- numeric
## adults 89528 -none- numeric
## children 89528 -none- numeric
## babies 89528 -none- numeric
## is_repeated_guest 89528 -none- numeric
## previous_cancellations 89528 -none- numeric
## previous_bookings_not_canceled 89528 -none- numeric
## booking_changes 89528 -none- numeric
## days_in_waiting_list 89528 -none- numeric
## adr 89528 -none- numeric
## required_car_parking_spaces 89528 -none- numeric
## total_of_special_requests 89528 -none- numeric
lm_model_train <- lm(adr ~ ., data = numeric_train_data)
summary(lm_model_train)
##
## Call:
## lm(formula = adr ~ ., data = numeric_train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -203.02 -27.48 -3.88 22.97 357.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.950337 18.641087 -3.323 0.000890 ***
## is_canceled 10.166041 0.318145 31.954 < 2e-16 ***
## lead_time -0.055063 0.001440 -38.251 < 2e-16 ***
## arrival_date_year 0.049471 0.009250 5.348 8.90e-08 ***
## arrival_date_week_number 0.291615 0.010383 28.086 < 2e-16 ***
## arrival_date_day_of_month 0.104052 0.015933 6.531 6.58e-11 ***
## stays_in_weekend_nights -0.551033 0.162074 -3.400 0.000674 ***
## stays_in_week_nights 0.792110 0.085528 9.261 < 2e-16 ***
## adults 25.473679 0.298397 85.368 < 2e-16 ***
## children 37.614816 0.353622 106.370 < 2e-16 ***
## babies 0.048070 1.572429 0.031 0.975612
## is_repeated_guest -24.643135 0.886712 -27.792 < 2e-16 ***
## previous_cancellations -2.717740 0.168025 -16.175 < 2e-16 ***
## previous_bookings_not_canceled 0.074429 0.100389 0.741 0.458451
## booking_changes 1.750363 0.219326 7.981 1.47e-15 ***
## days_in_waiting_list -0.025282 0.007888 -3.205 0.001350 **
## required_car_parking_spaces 7.397572 0.587955 12.582 < 2e-16 ***
## total_of_special_requests 7.148434 0.186357 38.359 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.71 on 89510 degrees of freedom
## Multiple R-squared: 0.2474, Adjusted R-squared: 0.2473
## F-statistic: 1731 on 17 and 89510 DF, p-value: < 2.2e-16
lm_model_train_win <- lm(adr ~ ., data = hotels_train_data_winsorized)
summary(lm_model_train_win)
##
## Call:
## lm(formula = adr ~ ., data = hotels_train_data_winsorized)
##
## Residuals:
## Min 1Q Median 3Q Max
## -134.293 -24.006 -3.275 22.339 137.027
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.621e+04 4.280e+02 -84.613 < 2e-16 ***
## is_canceled 9.284e+00 2.759e-01 33.651 < 2e-16 ***
## lead_time -6.720e-02 1.375e-03 -48.881 < 2e-16 ***
## arrival_date_year 1.798e+01 2.122e-01 84.711 < 2e-16 ***
## arrival_date_week_number 8.119e-01 1.092e-02 74.334 < 2e-16 ***
## arrival_date_day_of_month 5.217e-02 1.351e-02 3.861 0.000113 ***
## stays_in_weekend_nights 1.827e-03 1.442e-01 0.013 0.989893
## stays_in_week_nights 1.281e+00 8.738e-02 14.658 < 2e-16 ***
## adults 2.191e+01 2.563e-01 85.472 < 2e-16 ***
## children 3.958e+01 4.615e-01 85.771 < 2e-16 ***
## babies NA NA NA NA
## is_repeated_guest NA NA NA NA
## previous_cancellations -5.267e+00 5.756e-01 -9.149 < 2e-16 ***
## previous_bookings_not_canceled NA NA NA NA
## booking_changes 2.637e+00 3.369e-01 7.827 5.04e-15 ***
## days_in_waiting_list NA NA NA NA
## required_car_parking_spaces 5.265e+00 5.000e-01 10.530 < 2e-16 ***
## total_of_special_requests 5.031e+00 1.751e-01 28.728 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.08 on 89514 degrees of freedom
## Multiple R-squared: 0.2795, Adjusted R-squared: 0.2794
## F-statistic: 2671 on 13 and 89514 DF, p-value: < 2.2e-16
comparison_df <- data.frame(Actual = numeric_train_data$adr, lm_predicted = lm_model_train$fitted.values)
head(comparison_df)
## Actual lm_predicted
## 16562 81.0 96.45532
## 12105 80.1 103.70450
## 76839 89.0 104.74265
## 55685 115.0 96.35001
## 106096 105.0 90.61195
## 68024 130.0 97.38073
comparison_df <- data.frame(Actual = hotels_train_data_winsorized$adr, lm_predicted = lm_model_train_win$fitted.values)
head(comparison_df)
## Actual lm_predicted
## 1 81.0 78.39981
## 2 80.1 114.89974
## 3 89.0 87.32924
## 4 115.0 93.52211
## 5 105.0 96.42189
## 6 130.0 107.44216
lm_mse_train <- mean((lm_model_train$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 1739.62"
lm_mse_train_win <- mean((lm_model_train_win$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Winsorized Linear Model:", round(lm_mse_train_win, 2)))
## [1] "Training MSE for Winsorized Linear Model: 1671.39"
There do appear to be outliers. Applying winzorisation did lower the training and testing MSE for the linear regression models.
hotels_test_data_winsorized <- winsorize_all(numeric_test_data)
summary(hotels_test_data_winsorized)
## Length Class Mode
## is_canceled 29843 -none- numeric
## lead_time 29843 -none- numeric
## arrival_date_year 29843 -none- numeric
## arrival_date_week_number 29843 -none- numeric
## arrival_date_day_of_month 29843 -none- numeric
## stays_in_weekend_nights 29843 -none- numeric
## stays_in_week_nights 29843 -none- numeric
## adults 29843 -none- numeric
## children 29843 -none- numeric
## babies 29843 -none- numeric
## is_repeated_guest 29843 -none- numeric
## previous_cancellations 29843 -none- numeric
## previous_bookings_not_canceled 29843 -none- numeric
## booking_changes 29843 -none- numeric
## days_in_waiting_list 29843 -none- numeric
## adr 29843 -none- numeric
## required_car_parking_spaces 29843 -none- numeric
## total_of_special_requests 29843 -none- numeric
lm_model_test <- lm(adr ~ ., data = numeric_test_data)
summary(lm_model_test)
##
## Call:
## lm(formula = adr ~ ., data = numeric_test_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -344.76 -27.54 -3.82 22.96 280.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.054e+02 4.195e+01 -2.512 0.012005 *
## is_canceled 1.032e+01 5.584e-01 18.479 < 2e-16 ***
## lead_time -5.320e-02 2.484e-03 -21.421 < 2e-16 ***
## arrival_date_year 7.043e-02 2.081e-02 3.384 0.000715 ***
## arrival_date_week_number 2.822e-01 1.806e-02 15.628 < 2e-16 ***
## arrival_date_day_of_month 1.065e-01 2.781e-02 3.828 0.000129 ***
## stays_in_weekend_nights -1.015e+00 2.821e-01 -3.597 0.000323 ***
## stays_in_week_nights 9.450e-01 1.508e-01 6.266 3.77e-10 ***
## adults 2.600e+01 5.225e-01 49.759 < 2e-16 ***
## children 3.567e+01 6.136e-01 58.127 < 2e-16 ***
## babies 1.848e+00 2.771e+00 0.667 0.504813
## is_repeated_guest -2.449e+01 1.571e+00 -15.590 < 2e-16 ***
## previous_cancellations -2.769e+00 3.005e-01 -9.215 < 2e-16 ***
## previous_bookings_not_canceled -1.982e-02 2.034e-01 -0.097 0.922406
## booking_changes 1.420e+00 3.801e-01 3.736 0.000187 ***
## days_in_waiting_list -1.374e-02 1.529e-02 -0.898 0.369058
## required_car_parking_spaces 7.207e+00 1.001e+00 7.197 6.29e-13 ***
## total_of_special_requests 7.653e+00 3.256e-01 23.504 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42 on 29825 degrees of freedom
## Multiple R-squared: 0.2421, Adjusted R-squared: 0.2416
## F-statistic: 560.3 on 17 and 29825 DF, p-value: < 2.2e-16
lm_model_test_win <- lm(adr ~ ., data = hotels_test_data_winsorized)
summary(lm_model_test_win)
##
## Call:
## lm(formula = adr ~ ., data = hotels_test_data_winsorized)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.368 -24.168 -3.051 22.421 124.130
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.732e+04 7.429e+02 -50.240 < 2e-16 ***
## is_canceled 9.469e+00 4.818e-01 19.652 < 2e-16 ***
## lead_time -6.696e-02 2.373e-03 -28.213 < 2e-16 ***
## arrival_date_year 1.853e+01 3.684e-01 50.295 < 2e-16 ***
## arrival_date_week_number 8.299e-01 1.900e-02 43.684 < 2e-16 ***
## arrival_date_day_of_month 5.633e-02 2.347e-02 2.400 0.0164 *
## stays_in_weekend_nights -6.319e-01 2.504e-01 -2.524 0.0116 *
## stays_in_week_nights 1.333e+00 1.519e-01 8.776 < 2e-16 ***
## adults 2.257e+01 4.471e-01 50.481 < 2e-16 ***
## children 3.800e+01 7.975e-01 47.652 < 2e-16 ***
## babies NA NA NA NA
## is_repeated_guest NA NA NA NA
## previous_cancellations -5.639e+00 1.019e+00 -5.535 3.13e-08 ***
## previous_bookings_not_canceled NA NA NA NA
## booking_changes 2.786e+00 5.844e-01 4.767 1.88e-06 ***
## days_in_waiting_list NA NA NA NA
## required_car_parking_spaces 4.146e+00 8.605e-01 4.818 1.46e-06 ***
## total_of_special_requests 5.417e+00 3.037e-01 17.835 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.17 on 29829 degrees of freedom
## Multiple R-squared: 0.2814, Adjusted R-squared: 0.281
## F-statistic: 898.3 on 13 and 29829 DF, p-value: < 2.2e-16
comparison_df <- data.frame(Actual = numeric_test_data$adr, lm_predicted = lm_model_test$fitted.values)
head(comparison_df)
## Actual lm_predicted
## 8 103.00 105.31938
## 10 105.50 105.41917
## 23 84.67 103.24816
## 33 108.30 122.30285
## 37 98.00 97.38399
## 39 108.80 132.21362
comparison_df_win <- data.frame(Actual = hotels_test_data_winsorized$adr, lm_predicted = lm_model_test_win$fitted.values)
head(comparison_df)
## Actual lm_predicted
## 8 103.00 105.31938
## 10 105.50 105.41917
## 23 84.67 103.24816
## 33 108.30 122.30285
## 37 98.00 97.38399
## 39 108.80 132.21362
lm_mse_train <- mean((lm_model_train$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 1739.62"
lm_mse_train_win <- mean((lm_model_train_win$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Winsorized Linear Model:", round(lm_mse_train_win, 2)))
## [1] "Training MSE for Winsorized Linear Model: 1671.39"
lm_mse_test <- mean((lm_model_test$fitted.values - numeric_test_data$adr)^2)
print(paste("Test MSE for Linear Model:", round(lm_mse_test, 2)))
## [1] "Test MSE for Linear Model: 1763.21"
lm_mse_test_win <- mean((lm_model_test_win$fitted.values - numeric_test_data$adr)^2)
print(paste("Test MSE for Winsorized Linear Model:", round(lm_mse_test_win, 2)))
## [1] "Test MSE for Winsorized Linear Model: 1679.17"
We are assuming the relationship between the two variables is linear, the variance is equal across the regression line, and normality. The variance is large, but appears to be equal across the variables. JHowever, normality does not appear to be present.
ggplot(numeric_train_data, aes(x = adults, y = adr)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Linear Regression Average Daily Rate ~ Adults")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(numeric_train_data, aes(x = children, y = adr)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Linear Regression Average Daily Rate ~ Children")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(numeric_train_data, aes(x = is_repeated_guest, y = adr)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Linear Regression Average Daily Rate ~ Is Repeated Guest")
## `geom_smooth()` using formula = 'y ~ x'
Adding the interaction between adults and is_repeated_guest increased the R^2 from .09295 to .1015 showing a correlation.
lm_model_test <- lm(adr ~ adults, data = numeric_test_data)
summary(lm_model_test)
##
## Call:
## lm(formula = adr ~ adults, data = numeric_test_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -136.19 -31.10 -6.19 23.00 330.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.9101 1.0436 43.99 <2e-16 ***
## adults 30.0942 0.5442 55.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.94 on 29841 degrees of freedom
## Multiple R-squared: 0.09295, Adjusted R-squared: 0.09292
## F-statistic: 3058 on 1 and 29841 DF, p-value: < 2.2e-16
lm_model_test <- lm(adr ~ adults + adults*is_repeated_guest, data = numeric_test_data)
summary(lm_model_test)
##
## Call:
## lm(formula = adr ~ adults + adults * is_repeated_guest, data = numeric_test_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135.35 -31.71 -6.71 22.74 330.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.4140 1.0857 45.514 < 2e-16 ***
## adults 28.6465 0.5624 50.934 < 2e-16 ***
## is_repeated_guest -19.4695 4.1174 -4.729 2.27e-06 ***
## adults:is_repeated_guest -4.2869 2.7291 -1.571 0.116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.72 on 29839 degrees of freedom
## Multiple R-squared: 0.1015, Adjusted R-squared: 0.1014
## F-statistic: 1123 on 3 and 29839 DF, p-value: < 2.2e-16
lm_model_train_win <- lm(adr ~ adults + is_repeated_guest + adults * is_repeated_guest, data = hotels_train_data_winsorized)
summary(lm_model_train_win)
##
## Call:
## lm(formula = adr ~ adults + is_repeated_guest + adults * is_repeated_guest,
## data = hotels_train_data_winsorized)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.686 -29.789 -5.589 24.359 114.679
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.0536 0.5289 100.32 <2e-16 ***
## adults 25.7674 0.2760 93.37 <2e-16 ***
## is_repeated_guest NA NA NA NA
## adults:is_repeated_guest NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.45 on 89526 degrees of freedom
## Multiple R-squared: 0.08873, Adjusted R-squared: 0.08872
## F-statistic: 8717 on 1 and 89526 DF, p-value: < 2.2e-16
adr - Average Daily Rate knn_train_model_win <- kknn(adr ~ ., train = hotels_train_data_winsorized, test = hotels_train_data_winsorized, k = 14)
knn_train_pred <- fitted.values(knn_train_model_win)
# Calculate in-sample MSE manually
knn_train_mse <- mean((hotels_train_data_winsorized$adr - knn_train_pred)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse))
## [1] "In-Sample MSE for KNN: 424.353832972877"
Using standardized scaling will make the KNN comparisons much easier to work since the variances are on the same scale.
# mse_df <- data.frame(k = integer(), MSE_train = numeric(), MSE_test = numeric())
#
# for (k in c(9, 11, 12, 13, 14, 15)) {
#
# # Fit the k-NN model using training data
# knn_model_train <- kknn(adr ~ ., train = hotels_train_data_winsorized, test = hotels_train_data_winsorized, k = k)
#
# # Calculate the training MSE
# mse_train <- mean((knn_model_train$fitted.values - hotels_train_data_winsorized$adr)^2)
#
#
# # Test the k-NN with testing data
# knn_model_test <- kknn(adr ~ ., train = hotels_train_data_winsorized, test = hotels_test_data_winsorized, k = k)
# mse_test <- mean((knn_model_test$fitted.values - hotels_test_data_winsorized$adr)^2)
#
# # save the results
# mse_df <- rbind(mse_df, data.frame(k = k, MSE_train = mse_train, MSE_test = mse_test))
#
# }
#
# # Show the MSE dataframe
# print(mse_df)
I ran the above code to find the most viable k # based on the training/testing MSE. It appears k = 14 would be the best fit to maximize flexibility, but also avoid overfitting to the training set.
The KNN model takes a very long time to complete on such a large dataset. Linear regression based on the findings makes more sense since it is so easy to overfit the KNN models.