Setup & Load Libraries

knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(kknn)
library(dplyr)
library(tidyr)
library(stringr)
library(tidyverse)
library(corrplot)
library(DescTools)

Data Exploration

#import data
hotels_data <- read.csv("hotels.csv")

#Review variables - 119390 obs of 32 variables
str(hotels_data)
## 'data.frame':    119390 obs. of  32 variables:
##  $ hotel                         : chr  "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
##  $ is_canceled                   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : int  342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : chr  "July" "July" "July" "July" ...
##  $ arrival_date_week_number      : int  27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : int  0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : int  2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : chr  "BB" "BB" "BB" "BB" ...
##  $ country                       : chr  "PRT" "PRT" "GBR" "GBR" ...
##  $ market_segment                : chr  "Direct" "Direct" "Direct" "Corporate" ...
##  $ distribution_channel          : chr  "Direct" "Direct" "Direct" "Corporate" ...
##  $ is_repeated_guest             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : chr  "C" "C" "A" "A" ...
##  $ assigned_room_type            : chr  "C" "C" "C" "A" ...
##  $ booking_changes               : int  3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : chr  "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
##  $ agent                         : chr  "NULL" "NULL" "NULL" "304" ...
##  $ company                       : chr  "NULL" "NULL" "NULL" "NULL" ...
##  $ days_in_waiting_list          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : chr  "Transient" "Transient" "Transient" "Transient" ...
##  $ adr                           : num  0 0 75 75 98 ...
##  $ required_car_parking_spaces   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : chr  "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
##  $ reservation_status_date       : chr  "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
# Checking for values that don't make sense
summary(hotels_data)
##     hotel            is_canceled       lead_time   arrival_date_year
##  Length:119390      Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Class :character   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##  Mode  :character   Median :0.0000   Median : 69   Median :2016     
##                     Mean   :0.3704   Mean   :104   Mean   :2016     
##                     3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                     Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                     
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  Length:119390      Min.   : 1.00            Min.   : 1.0             
##  Class :character   1st Qu.:16.00            1st Qu.: 8.0             
##  Mode  :character   Median :28.00            Median :16.0             
##                     Mean   :27.17            Mean   :15.8             
##                     3rd Qu.:38.00            3rd Qu.:23.0             
##                     Max.   :53.00            Max.   :31.0             
##                                                                       
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies              meal             country         
##  Min.   : 0.0000   Min.   : 0.000000   Length:119390      Length:119390     
##  1st Qu.: 0.0000   1st Qu.: 0.000000   Class :character   Class :character  
##  Median : 0.0000   Median : 0.000000   Mode  :character   Mode  :character  
##  Mean   : 0.1039   Mean   : 0.007949                                        
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000                                        
##  Max.   :10.0000   Max.   :10.000000                                        
##  NA's   :4                                                                  
##  market_segment     distribution_channel is_repeated_guest
##  Length:119390      Length:119390        Min.   :0.00000  
##  Class :character   Class :character     1st Qu.:0.00000  
##  Mode  :character   Mode  :character     Median :0.00000  
##                                          Mean   :0.03191  
##                                          3rd Qu.:0.00000  
##                                          Max.   :1.00000  
##                                                           
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                Length:119390     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                Class :character  
##  Median : 0.00000       Median : 0.0000                Mode  :character  
##  Mean   : 0.08712       Mean   : 0.1371                                  
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                                  
##  Max.   :26.00000       Max.   :72.0000                                  
##                                                                          
##  assigned_room_type booking_changes   deposit_type          agent          
##  Length:119390      Min.   : 0.0000   Length:119390      Length:119390     
##  Class :character   1st Qu.: 0.0000   Class :character   Class :character  
##  Mode  :character   Median : 0.0000   Mode  :character   Mode  :character  
##                     Mean   : 0.2211                                        
##                     3rd Qu.: 0.0000                                        
##                     Max.   :21.0000                                        
##                                                                            
##    company          days_in_waiting_list customer_type           adr         
##  Length:119390      Min.   :  0.000      Length:119390      Min.   :  -6.38  
##  Class :character   1st Qu.:  0.000      Class :character   1st Qu.:  69.29  
##  Mode  :character   Median :  0.000      Mode  :character   Median :  94.58  
##                     Mean   :  2.321                         Mean   : 101.83  
##                     3rd Qu.:  0.000                         3rd Qu.: 126.00  
##                     Max.   :391.000                         Max.   :5400.00  
##                                                                              
##  required_car_parking_spaces total_of_special_requests reservation_status
##  Min.   :0.00000             Min.   :0.0000            Length:119390     
##  1st Qu.:0.00000             1st Qu.:0.0000            Class :character  
##  Median :0.00000             Median :0.0000            Mode  :character  
##  Mean   :0.06252             Mean   :0.5714                              
##  3rd Qu.:0.00000             3rd Qu.:1.0000                              
##  Max.   :8.00000             Max.   :5.0000                              
##                                                                          
##  reservation_status_date
##  Length:119390          
##  Class :character       
##  Mode  :character       
##                         
##                         
##                         
## 
# Find all numeric variables 
numeric_data <- hotels_data[sapply(hotels_data, is.numeric)]
numeric_data_melted <- reshape2::melt(numeric_data)
## No id variables; using all as measure variables
# Create a boxplot for numeric variables
ggplot(numeric_data_melted, aes(y=value)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = 'free', ncol=2) +
  labs(x='', y='')
## Warning: Removed 4 rows containing non-finite values (`stat_boxplot()`).

Data Preprocessing

Clean adr

Setting the outlier 5400 to the mean of adr

summary(hotels_data$adr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -6.38   69.29   94.58  101.83  126.00 5400.00
hotels_data[hotels_data$adr > 500, ] <- mean(hotels_data$adr)
hotels_data[hotels_data$adr > 500, ]
##  [1] hotel                          is_canceled                   
##  [3] lead_time                      arrival_date_year             
##  [5] arrival_date_month             arrival_date_week_number      
##  [7] arrival_date_day_of_month      stays_in_weekend_nights       
##  [9] stays_in_week_nights           adults                        
## [11] children                       babies                        
## [13] meal                           country                       
## [15] market_segment                 distribution_channel          
## [17] is_repeated_guest              previous_cancellations        
## [19] previous_bookings_not_canceled reserved_room_type            
## [21] assigned_room_type             booking_changes               
## [23] deposit_type                   agent                         
## [25] company                        days_in_waiting_list          
## [27] customer_type                  adr                           
## [29] required_car_parking_spaces    total_of_special_requests     
## [31] reservation_status             reservation_status_date       
## <0 rows> (or 0-length row.names)

Clean arrival_date_year

Found arrival year date min = 0 leading to all var in these obs as 0. should remove

summary(hotels_data$arrival_date_year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   101.8  2016.0  2016.0  2016.1  2017.0  2017.0
unique(hotels_data$arrival_date_year)
## [1] 2015.0000 2016.0000 2017.0000  101.8311
hotels_data[hotels_data$arrival_date_year == 0, ]
##  [1] hotel                          is_canceled                   
##  [3] lead_time                      arrival_date_year             
##  [5] arrival_date_month             arrival_date_week_number      
##  [7] arrival_date_day_of_month      stays_in_weekend_nights       
##  [9] stays_in_week_nights           adults                        
## [11] children                       babies                        
## [13] meal                           country                       
## [15] market_segment                 distribution_channel          
## [17] is_repeated_guest              previous_cancellations        
## [19] previous_bookings_not_canceled reserved_room_type            
## [21] assigned_room_type             booking_changes               
## [23] deposit_type                   agent                         
## [25] company                        days_in_waiting_list          
## [27] customer_type                  adr                           
## [29] required_car_parking_spaces    total_of_special_requests     
## [31] reservation_status             reservation_status_date       
## <0 rows> (or 0-length row.names)
# Keep the rows where arrival_date_year is not 0
hotels_data <- hotels_data[hotels_data$arrival_date_year != 0, ]

# Verification step
summary(hotels_data$arrival_date_year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   101.8  2016.0  2016.0  2016.1  2017.0  2017.0
unique(hotels_data$arrival_date_year)
## [1] 2015.0000 2016.0000 2017.0000  101.8311

Clean adults

summary(hotels_data$adults)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   2.000   1.859   2.000 101.831
unique(hotels_data$adults)
##  [1]   2.0000   1.0000   3.0000   4.0000  40.0000  26.0000  50.0000  27.0000
##  [9]  55.0000   0.0000  20.0000   6.0000   5.0000  10.0000 101.8311
#Adults summary values seems off with a max number of 55
unique(hotels_data$adults)
##  [1]   2.0000   1.0000   3.0000   4.0000  40.0000  26.0000  50.0000  27.0000
##  [9]  55.0000   0.0000  20.0000   6.0000   5.0000  10.0000 101.8311
# All of the obs with adults > 4 is cancelled. These obs will be removed assuming it was a typo error then cancellation to correct
head(hotels_data[hotels_data$adults > 4, "reservation_status"])
## [1] "Canceled" "Canceled" "Canceled" "Canceled" "Canceled" "Canceled"
# Believed the 0 adults counts could be an error, but the reservation status shows check-out and the other variables work so it appears to be an actual booking just missing the guest count
summary(hotels_data[hotels_data$adults == 0,])
##     hotel            is_canceled       lead_time      arrival_date_year
##  Length:403         Min.   :0.0000   Min.   :  0.00   Min.   :2015     
##  Class :character   1st Qu.:0.0000   1st Qu.:  1.00   1st Qu.:2016     
##  Mode  :character   Median :0.0000   Median : 44.00   Median :2016     
##                     Mean   :0.2705   Mean   : 86.56   Mean   :2016     
##                     3rd Qu.:1.0000   3rd Qu.:153.50   3rd Qu.:2017     
##                     Max.   :1.0000   Max.   :434.00   Max.   :2017     
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  Length:403         Min.   : 1.00            Min.   : 1.00            
##  Class :character   1st Qu.:13.00            1st Qu.: 8.50            
##  Mode  :character   Median :28.00            Median :16.00            
##                     Mean   :27.47            Mean   :16.23            
##                     3rd Qu.:40.50            3rd Qu.:24.00            
##                     Max.   :53.00            Max.   :31.00            
##  stays_in_weekend_nights stays_in_week_nights     adults     children    
##  Min.   : 0.000          Min.   : 0.000       Min.   :0   Min.   :0.000  
##  1st Qu.: 0.000          1st Qu.: 1.000       1st Qu.:0   1st Qu.:0.000  
##  Median : 1.000          Median : 2.000       Median :0   Median :2.000  
##  Mean   : 1.119          Mean   : 2.871       Mean   :0   Mean   :1.124  
##  3rd Qu.: 2.000          3rd Qu.: 4.000       3rd Qu.:0   3rd Qu.:2.000  
##  Max.   :16.000          Max.   :41.000       Max.   :0   Max.   :3.000  
##      babies             meal             country          market_segment    
##  Min.   :0.000000   Length:403         Length:403         Length:403        
##  1st Qu.:0.000000   Class :character   Class :character   Class :character  
##  Median :0.000000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.007444                                                           
##  3rd Qu.:0.000000                                                           
##  Max.   :1.000000                                                           
##  distribution_channel is_repeated_guest previous_cancellations
##  Length:403           Min.   :0.000     Min.   :0.00000       
##  Class :character     1st Qu.:0.000     1st Qu.:0.00000       
##  Mode  :character     Median :0.000     Median :0.00000       
##                       Mean   :0.139     Mean   :0.02233       
##                       3rd Qu.:0.000     3rd Qu.:0.00000       
##                       Max.   :1.000     Max.   :2.00000       
##  previous_bookings_not_canceled reserved_room_type assigned_room_type
##  Min.   :0.00000                Length:403         Length:403        
##  1st Qu.:0.00000                Class :character   Class :character  
##  Median :0.00000                Mode  :character   Mode  :character  
##  Mean   :0.06452                                                     
##  3rd Qu.:0.00000                                                     
##  Max.   :9.00000                                                     
##  booking_changes   deposit_type          agent             company         
##  Min.   : 0.0000   Length:403         Length:403         Length:403        
##  1st Qu.: 0.0000   Class :character   Class :character   Class :character  
##  Median : 0.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 0.9876                                                           
##  3rd Qu.: 1.0000                                                           
##  Max.   :21.0000                                                           
##  days_in_waiting_list customer_type           adr        
##  Min.   :  0.000      Length:403         Min.   :  0.00  
##  1st Qu.:  0.000      Class :character   1st Qu.:  0.00  
##  Median :  0.000      Mode  :character   Median : 56.27  
##  Mean   :  1.017                         Mean   : 49.58  
##  3rd Qu.:  0.000                         3rd Qu.: 91.85  
##  Max.   :122.000                         Max.   :200.00  
##  required_car_parking_spaces total_of_special_requests reservation_status
##  Min.   :0.00000             Min.   :0.0000            Length:403        
##  1st Qu.:0.00000             1st Qu.:0.0000            Class :character  
##  Median :0.00000             Median :1.0000            Mode  :character  
##  Mean   :0.02233             Mean   :0.7841                              
##  3rd Qu.:0.00000             3rd Qu.:1.0000                              
##  Max.   :1.00000             Max.   :4.0000                              
##  reservation_status_date
##  Length:403             
##  Class :character       
##  Mode  :character       
##                         
##                         
## 
hotels_data <- hotels_data[hotels_data$adults < 5, ]

# Verification step
summary(hotels_data$adults)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   2.000   1.853   2.000   4.000
unique(hotels_data$adults)
## [1] 2 1 3 4 0

Clean babies

summary(hotels_data$babies)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.00795  0.00000 10.00000
unique(hotels_data$babies)
## [1]  0  1  2 10  9
# Babies var seems off. There are 9 and 10 counts. This appears to be a one-off error. changing value to 0
unique(hotels_data$babies)
## [1]  0  1  2 10  9
hotels_data[hotels_data$babies == 10, ] <- 0
hotels_data[hotels_data$babies == 9, ] <- 0
unique(hotels_data$babies)
## [1] 0 1 2
# Verification step
summary(hotels_data$babies)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.007791 0.000000 2.000000
unique(hotels_data$babies)
## [1] 0 1 2

Reviewing NA values

Found 4 NA in children and changed them to 0.

# checking for NA
colSums(is.na(hotels_data))
##                          hotel                    is_canceled 
##                              0                              0 
##                      lead_time              arrival_date_year 
##                              0                              0 
##             arrival_date_month       arrival_date_week_number 
##                              0                              0 
##      arrival_date_day_of_month        stays_in_weekend_nights 
##                              0                              0 
##           stays_in_week_nights                         adults 
##                              0                              0 
##                       children                         babies 
##                              4                              0 
##                           meal                        country 
##                              0                              0 
##                 market_segment           distribution_channel 
##                              0                              0 
##              is_repeated_guest         previous_cancellations 
##                              0                              0 
## previous_bookings_not_canceled             reserved_room_type 
##                              0                              0 
##             assigned_room_type                booking_changes 
##                              0                              0 
##                   deposit_type                          agent 
##                              0                              0 
##                        company           days_in_waiting_list 
##                              0                              0 
##                  customer_type                            adr 
##                              0                              0 
##    required_car_parking_spaces      total_of_special_requests 
##                              0                              0 
##             reservation_status        reservation_status_date 
##                              0                              0
# 4 NA found in children. We will assume NA means 0 children
hotels_data[is.na(hotels_data$children),] <- 0

colSums(is.na(hotels_data))
##                          hotel                    is_canceled 
##                              0                              0 
##                      lead_time              arrival_date_year 
##                              0                              0 
##             arrival_date_month       arrival_date_week_number 
##                              0                              0 
##      arrival_date_day_of_month        stays_in_weekend_nights 
##                              0                              0 
##           stays_in_week_nights                         adults 
##                              0                              0 
##                       children                         babies 
##                              0                              0 
##                           meal                        country 
##                              0                              0 
##                 market_segment           distribution_channel 
##                              0                              0 
##              is_repeated_guest         previous_cancellations 
##                              0                              0 
## previous_bookings_not_canceled             reserved_room_type 
##                              0                              0 
##             assigned_room_type                booking_changes 
##                              0                              0 
##                   deposit_type                          agent 
##                              0                              0 
##                        company           days_in_waiting_list 
##                              0                              0 
##                  customer_type                            adr 
##                              0                              0 
##    required_car_parking_spaces      total_of_special_requests 
##                              0                              0 
##             reservation_status        reservation_status_date 
##                              0                              0

Correlation Plot

corr_matrix <- cor(hotels_data[sapply(hotels_data, is.numeric)])
corrplot(corr_matrix, method="circle", type="upper", order="hclust",
         tl.col="black", tl.srt=45)

1. What variables are most strongly correlated with target?

Upon review of the above correlation plot we can see that:

Strongest Positive Correlations - Visual review:

  • children ~ adr
  • adults ~ adr
  • stays_in_weekend_nights ~ stays_in_weeks_nights
  • is_canceled ~ lead_time

Strongest Negative Correlations - Visual review:

  • required_car_parking_spaces ~ is_canceled
  • booking_changes ~ is_canceled
  • required_car_parking_spaces ~ lead_time
  • adr ~ is_repeated_guest
  • adults ~ is_repeated_guest
  • total_of_special_requests ~ is_canceled
  • is_repeated_guest ~ lead_time

Target Variable adr Correlations * children ~ adr * adults ~ adr * adr ~ is_repeated_guest

Create training and testing sets

# Set the seed
set.seed(21)

# Randomly sample row indices for the training set
train_indices <- sample(1:NROW(hotels_data),NROW(hotels_data)*0.75)
 
# Create the training set
train_data <- hotels_data[train_indices, ]
numeric_train_data <- train_data[sapply(train_data, is.numeric)]

# Create the testing set
test_data <- hotels_data[-train_indices, ]
numeric_test_data <- test_data[sapply(test_data, is.numeric)]

Linear Regression

Winsorization function

# Winsorize all numeric variables in the dataframe
winsorize_all <- function(data, lower = 0.05, upper = 0.95) {
  data <- lapply(data, function(x) Winsorize(x, probs = c(lower, upper)))
  return(data)
}

Target variable adr - Average Daily Rate

In sample linear model/winsorization comparison

hotels_train_data_winsorized <- winsorize_all(numeric_train_data)
summary(hotels_train_data_winsorized)
##                                Length Class  Mode   
## is_canceled                    89528  -none- numeric
## lead_time                      89528  -none- numeric
## arrival_date_year              89528  -none- numeric
## arrival_date_week_number       89528  -none- numeric
## arrival_date_day_of_month      89528  -none- numeric
## stays_in_weekend_nights        89528  -none- numeric
## stays_in_week_nights           89528  -none- numeric
## adults                         89528  -none- numeric
## children                       89528  -none- numeric
## babies                         89528  -none- numeric
## is_repeated_guest              89528  -none- numeric
## previous_cancellations         89528  -none- numeric
## previous_bookings_not_canceled 89528  -none- numeric
## booking_changes                89528  -none- numeric
## days_in_waiting_list           89528  -none- numeric
## adr                            89528  -none- numeric
## required_car_parking_spaces    89528  -none- numeric
## total_of_special_requests      89528  -none- numeric
lm_model_train <- lm(adr ~ ., data = numeric_train_data)
summary(lm_model_train)
## 
## Call:
## lm(formula = adr ~ ., data = numeric_train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -203.02  -27.48   -3.88   22.97  357.01 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -61.950337  18.641087  -3.323 0.000890 ***
## is_canceled                     10.166041   0.318145  31.954  < 2e-16 ***
## lead_time                       -0.055063   0.001440 -38.251  < 2e-16 ***
## arrival_date_year                0.049471   0.009250   5.348 8.90e-08 ***
## arrival_date_week_number         0.291615   0.010383  28.086  < 2e-16 ***
## arrival_date_day_of_month        0.104052   0.015933   6.531 6.58e-11 ***
## stays_in_weekend_nights         -0.551033   0.162074  -3.400 0.000674 ***
## stays_in_week_nights             0.792110   0.085528   9.261  < 2e-16 ***
## adults                          25.473679   0.298397  85.368  < 2e-16 ***
## children                        37.614816   0.353622 106.370  < 2e-16 ***
## babies                           0.048070   1.572429   0.031 0.975612    
## is_repeated_guest              -24.643135   0.886712 -27.792  < 2e-16 ***
## previous_cancellations          -2.717740   0.168025 -16.175  < 2e-16 ***
## previous_bookings_not_canceled   0.074429   0.100389   0.741 0.458451    
## booking_changes                  1.750363   0.219326   7.981 1.47e-15 ***
## days_in_waiting_list            -0.025282   0.007888  -3.205 0.001350 ** 
## required_car_parking_spaces      7.397572   0.587955  12.582  < 2e-16 ***
## total_of_special_requests        7.148434   0.186357  38.359  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.71 on 89510 degrees of freedom
## Multiple R-squared:  0.2474, Adjusted R-squared:  0.2473 
## F-statistic:  1731 on 17 and 89510 DF,  p-value: < 2.2e-16
lm_model_train_win <- lm(adr ~ ., data = hotels_train_data_winsorized)
summary(lm_model_train_win)
## 
## Call:
## lm(formula = adr ~ ., data = hotels_train_data_winsorized)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -134.293  -24.006   -3.275   22.339  137.027 
## 
## Coefficients: (4 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -3.621e+04  4.280e+02 -84.613  < 2e-16 ***
## is_canceled                     9.284e+00  2.759e-01  33.651  < 2e-16 ***
## lead_time                      -6.720e-02  1.375e-03 -48.881  < 2e-16 ***
## arrival_date_year               1.798e+01  2.122e-01  84.711  < 2e-16 ***
## arrival_date_week_number        8.119e-01  1.092e-02  74.334  < 2e-16 ***
## arrival_date_day_of_month       5.217e-02  1.351e-02   3.861 0.000113 ***
## stays_in_weekend_nights         1.827e-03  1.442e-01   0.013 0.989893    
## stays_in_week_nights            1.281e+00  8.738e-02  14.658  < 2e-16 ***
## adults                          2.191e+01  2.563e-01  85.472  < 2e-16 ***
## children                        3.958e+01  4.615e-01  85.771  < 2e-16 ***
## babies                                 NA         NA      NA       NA    
## is_repeated_guest                      NA         NA      NA       NA    
## previous_cancellations         -5.267e+00  5.756e-01  -9.149  < 2e-16 ***
## previous_bookings_not_canceled         NA         NA      NA       NA    
## booking_changes                 2.637e+00  3.369e-01   7.827 5.04e-15 ***
## days_in_waiting_list                   NA         NA      NA       NA    
## required_car_parking_spaces     5.265e+00  5.000e-01  10.530  < 2e-16 ***
## total_of_special_requests       5.031e+00  1.751e-01  28.728  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.08 on 89514 degrees of freedom
## Multiple R-squared:  0.2795, Adjusted R-squared:  0.2794 
## F-statistic:  2671 on 13 and 89514 DF,  p-value: < 2.2e-16
comparison_df <- data.frame(Actual = numeric_train_data$adr, lm_predicted = lm_model_train$fitted.values)
head(comparison_df)
##        Actual lm_predicted
## 16562    81.0     96.45532
## 12105    80.1    103.70450
## 76839    89.0    104.74265
## 55685   115.0     96.35001
## 106096  105.0     90.61195
## 68024   130.0     97.38073
comparison_df <- data.frame(Actual = hotels_train_data_winsorized$adr, lm_predicted = lm_model_train_win$fitted.values)
head(comparison_df)
##   Actual lm_predicted
## 1   81.0     78.39981
## 2   80.1    114.89974
## 3   89.0     87.32924
## 4  115.0     93.52211
## 5  105.0     96.42189
## 6  130.0    107.44216

In sample MSE comparison to winsorization model

lm_mse_train <- mean((lm_model_train$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 1739.62"
lm_mse_train_win <- mean((lm_model_train_win$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Winsorized Linear Model:", round(lm_mse_train_win, 2)))
## [1] "Training MSE for Winsorized Linear Model: 1671.39"

Out of sample linear model/winsorization comparison

5. Is there any outlier in the dataset? If yes, apply truncation or winsorization techniques to handle outliers. Compare the performance of the models before and after applying these techniques. What differences do you observe?

There do appear to be outliers. Applying winzorisation did lower the training and testing MSE for the linear regression models.

hotels_test_data_winsorized <- winsorize_all(numeric_test_data)
summary(hotels_test_data_winsorized)
##                                Length Class  Mode   
## is_canceled                    29843  -none- numeric
## lead_time                      29843  -none- numeric
## arrival_date_year              29843  -none- numeric
## arrival_date_week_number       29843  -none- numeric
## arrival_date_day_of_month      29843  -none- numeric
## stays_in_weekend_nights        29843  -none- numeric
## stays_in_week_nights           29843  -none- numeric
## adults                         29843  -none- numeric
## children                       29843  -none- numeric
## babies                         29843  -none- numeric
## is_repeated_guest              29843  -none- numeric
## previous_cancellations         29843  -none- numeric
## previous_bookings_not_canceled 29843  -none- numeric
## booking_changes                29843  -none- numeric
## days_in_waiting_list           29843  -none- numeric
## adr                            29843  -none- numeric
## required_car_parking_spaces    29843  -none- numeric
## total_of_special_requests      29843  -none- numeric
lm_model_test <- lm(adr ~ ., data = numeric_test_data)
summary(lm_model_test)
## 
## Call:
## lm(formula = adr ~ ., data = numeric_test_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -344.76  -27.54   -3.82   22.96  280.85 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.054e+02  4.195e+01  -2.512 0.012005 *  
## is_canceled                     1.032e+01  5.584e-01  18.479  < 2e-16 ***
## lead_time                      -5.320e-02  2.484e-03 -21.421  < 2e-16 ***
## arrival_date_year               7.043e-02  2.081e-02   3.384 0.000715 ***
## arrival_date_week_number        2.822e-01  1.806e-02  15.628  < 2e-16 ***
## arrival_date_day_of_month       1.065e-01  2.781e-02   3.828 0.000129 ***
## stays_in_weekend_nights        -1.015e+00  2.821e-01  -3.597 0.000323 ***
## stays_in_week_nights            9.450e-01  1.508e-01   6.266 3.77e-10 ***
## adults                          2.600e+01  5.225e-01  49.759  < 2e-16 ***
## children                        3.567e+01  6.136e-01  58.127  < 2e-16 ***
## babies                          1.848e+00  2.771e+00   0.667 0.504813    
## is_repeated_guest              -2.449e+01  1.571e+00 -15.590  < 2e-16 ***
## previous_cancellations         -2.769e+00  3.005e-01  -9.215  < 2e-16 ***
## previous_bookings_not_canceled -1.982e-02  2.034e-01  -0.097 0.922406    
## booking_changes                 1.420e+00  3.801e-01   3.736 0.000187 ***
## days_in_waiting_list           -1.374e-02  1.529e-02  -0.898 0.369058    
## required_car_parking_spaces     7.207e+00  1.001e+00   7.197 6.29e-13 ***
## total_of_special_requests       7.653e+00  3.256e-01  23.504  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42 on 29825 degrees of freedom
## Multiple R-squared:  0.2421, Adjusted R-squared:  0.2416 
## F-statistic: 560.3 on 17 and 29825 DF,  p-value: < 2.2e-16
lm_model_test_win <- lm(adr ~ ., data = hotels_test_data_winsorized)
summary(lm_model_test_win)
## 
## Call:
## lm(formula = adr ~ ., data = hotels_test_data_winsorized)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -133.368  -24.168   -3.051   22.421  124.130 
## 
## Coefficients: (4 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -3.732e+04  7.429e+02 -50.240  < 2e-16 ***
## is_canceled                     9.469e+00  4.818e-01  19.652  < 2e-16 ***
## lead_time                      -6.696e-02  2.373e-03 -28.213  < 2e-16 ***
## arrival_date_year               1.853e+01  3.684e-01  50.295  < 2e-16 ***
## arrival_date_week_number        8.299e-01  1.900e-02  43.684  < 2e-16 ***
## arrival_date_day_of_month       5.633e-02  2.347e-02   2.400   0.0164 *  
## stays_in_weekend_nights        -6.319e-01  2.504e-01  -2.524   0.0116 *  
## stays_in_week_nights            1.333e+00  1.519e-01   8.776  < 2e-16 ***
## adults                          2.257e+01  4.471e-01  50.481  < 2e-16 ***
## children                        3.800e+01  7.975e-01  47.652  < 2e-16 ***
## babies                                 NA         NA      NA       NA    
## is_repeated_guest                      NA         NA      NA       NA    
## previous_cancellations         -5.639e+00  1.019e+00  -5.535 3.13e-08 ***
## previous_bookings_not_canceled         NA         NA      NA       NA    
## booking_changes                 2.786e+00  5.844e-01   4.767 1.88e-06 ***
## days_in_waiting_list                   NA         NA      NA       NA    
## required_car_parking_spaces     4.146e+00  8.605e-01   4.818 1.46e-06 ***
## total_of_special_requests       5.417e+00  3.037e-01  17.835  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.17 on 29829 degrees of freedom
## Multiple R-squared:  0.2814, Adjusted R-squared:  0.281 
## F-statistic: 898.3 on 13 and 29829 DF,  p-value: < 2.2e-16
comparison_df <- data.frame(Actual = numeric_test_data$adr, lm_predicted = lm_model_test$fitted.values)
head(comparison_df)
##    Actual lm_predicted
## 8  103.00    105.31938
## 10 105.50    105.41917
## 23  84.67    103.24816
## 33 108.30    122.30285
## 37  98.00     97.38399
## 39 108.80    132.21362
comparison_df_win <- data.frame(Actual = hotels_test_data_winsorized$adr, lm_predicted = lm_model_test_win$fitted.values)
head(comparison_df)
##    Actual lm_predicted
## 8  103.00    105.31938
## 10 105.50    105.41917
## 23  84.67    103.24816
## 33 108.30    122.30285
## 37  98.00     97.38399
## 39 108.80    132.21362

MSE comparison of In sample and out of sample testing and training models

lm_mse_train <- mean((lm_model_train$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Linear Model:", round(lm_mse_train, 2)))
## [1] "Training MSE for Linear Model: 1739.62"
lm_mse_train_win <- mean((lm_model_train_win$fitted.values - numeric_train_data$adr)^2)
print(paste("Training MSE for Winsorized Linear Model:", round(lm_mse_train_win, 2)))
## [1] "Training MSE for Winsorized Linear Model: 1671.39"
lm_mse_test <- mean((lm_model_test$fitted.values - numeric_test_data$adr)^2)
print(paste("Test MSE for Linear Model:", round(lm_mse_test, 2)))
## [1] "Test MSE for Linear Model: 1763.21"
lm_mse_test_win <- mean((lm_model_test_win$fitted.values - numeric_test_data$adr)^2)
print(paste("Test MSE for Winsorized Linear Model:", round(lm_mse_test_win, 2)))
## [1] "Test MSE for Winsorized Linear Model: 1679.17"

Linear testing on correlated variables

3. What assumptions are being made when we use linear regression? Are they met in this dataset? Just describe what you observe from the diagnostic plots.

We are assuming the relationship between the two variables is linear, the variance is equal across the regression line, and normality. The variance is large, but appears to be equal across the variables. JHowever, normality does not appear to be present.

ggplot(numeric_train_data, aes(x = adults, y = adr)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Linear Regression Average Daily Rate ~ Adults")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(numeric_train_data, aes(x = children, y = adr)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Linear Regression Average Daily Rate ~ Children")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(numeric_train_data, aes(x = is_repeated_guest, y = adr)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Linear Regression Average Daily Rate ~ Is Repeated Guest")
## `geom_smooth()` using formula = 'y ~ x'

4. Try adding interaction terms to your linear regression model. At least try to find out one interaction term that has a statistically significant coefficient. Report the interaction term and check how these interaction terms influence the model’s performance in terms of R^2 and how do you interpret your new model?

Adding the interaction between adults and is_repeated_guest increased the R^2 from .09295 to .1015 showing a correlation.

lm_model_test <- lm(adr ~ adults, data = numeric_test_data)
summary(lm_model_test)
## 
## Call:
## lm(formula = adr ~ adults, data = numeric_test_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -136.19  -31.10   -6.19   23.00  330.90 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.9101     1.0436   43.99   <2e-16 ***
## adults       30.0942     0.5442   55.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.94 on 29841 degrees of freedom
## Multiple R-squared:  0.09295,    Adjusted R-squared:  0.09292 
## F-statistic:  3058 on 1 and 29841 DF,  p-value: < 2.2e-16
lm_model_test <- lm(adr ~ adults + adults*is_repeated_guest, data = numeric_test_data)
summary(lm_model_test)
## 
## Call:
## lm(formula = adr ~ adults + adults * is_repeated_guest, data = numeric_test_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.35  -31.71   -6.71   22.74  330.29 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               49.4140     1.0857  45.514  < 2e-16 ***
## adults                    28.6465     0.5624  50.934  < 2e-16 ***
## is_repeated_guest        -19.4695     4.1174  -4.729 2.27e-06 ***
## adults:is_repeated_guest  -4.2869     2.7291  -1.571    0.116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.72 on 29839 degrees of freedom
## Multiple R-squared:  0.1015, Adjusted R-squared:  0.1014 
## F-statistic:  1123 on 3 and 29839 DF,  p-value: < 2.2e-16
lm_model_train_win <- lm(adr ~ adults + is_repeated_guest + adults * is_repeated_guest, data = hotels_train_data_winsorized)
summary(lm_model_train_win)
## 
## Call:
## lm(formula = adr ~ adults + is_repeated_guest + adults * is_repeated_guest, 
##     data = hotels_train_data_winsorized)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -91.686 -29.789  -5.589  24.359 114.679 
## 
## Coefficients: (2 not defined because of singularities)
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               53.0536     0.5289  100.32   <2e-16 ***
## adults                    25.7674     0.2760   93.37   <2e-16 ***
## is_repeated_guest              NA         NA      NA       NA    
## adults:is_repeated_guest       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.45 on 89526 degrees of freedom
## Multiple R-squared:  0.08873,    Adjusted R-squared:  0.08872 
## F-statistic:  8717 on 1 and 89526 DF,  p-value: < 2.2e-16

KNN Model

Target variable adr - Average Daily Rate

 knn_train_model_win <- kknn(adr ~ ., train = hotels_train_data_winsorized, test = hotels_train_data_winsorized, k = 14)
 
 
 knn_train_pred <- fitted.values(knn_train_model_win)
 
# Calculate in-sample MSE manually
knn_train_mse <- mean((hotels_train_data_winsorized$adr - knn_train_pred)^2)
print(paste("In-Sample MSE for KNN: ", knn_train_mse))
## [1] "In-Sample MSE for KNN:  424.353832972877"

Fitting k value for optimal usage

6. How could feature scaling (standardization) affect the KNN model?

Using standardized scaling will make the KNN comparisons much easier to work since the variances are on the same scale.

# mse_df <- data.frame(k = integer(), MSE_train = numeric(), MSE_test = numeric())
# 
# for (k in c(9, 11, 12, 13, 14, 15)) {
#   
#   # Fit the k-NN model using training data
#   knn_model_train <- kknn(adr ~ ., train = hotels_train_data_winsorized, test = hotels_train_data_winsorized, k = k)
#   
#   # Calculate the training MSE
#   mse_train <-  mean((knn_model_train$fitted.values - hotels_train_data_winsorized$adr)^2)
#   
#   
#   # Test the k-NN with testing data
#   knn_model_test <- kknn(adr ~ ., train = hotels_train_data_winsorized, test = hotels_test_data_winsorized, k = k)
#   mse_test <-  mean((knn_model_test$fitted.values - hotels_test_data_winsorized$adr)^2)
#   
#   # save the results
#   mse_df <- rbind(mse_df, data.frame(k = k, MSE_train = mse_train, MSE_test = mse_test))
#   
# }
# 
# # Show the MSE dataframe
# print(mse_df)

2. How does the value of k in KNN affect the model’s performance (in terms of training MSE and testing MSE)?

I ran the above code to find the most viable k # based on the training/testing MSE. It appears k = 14 would be the best fit to maximize flexibility, but also avoid overfitting to the training set.

7. What insights can you derive from comparing the linear regression and KNN models? Which model would you recommend the most and Why?

The KNN model takes a very long time to complete on such a large dataset. Linear regression based on the findings makes more sense since it is so easy to overfit the KNN models.