library(dplyr)
library(ggplot2)
library(caret)
library(superml)
df<- read.csv("hotel_bookings.csv")
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
#Check if there is any missing value
any(is.na(df))
## [1] TRUE
#Count missing value from the entire dataframe
sum(is.na(df))
## [1] 4
#Overview of null value from each column
colSums(is.na(df))
## hotel is_canceled
## 0 0
## lead_time arrival_date_year
## 0 0
## arrival_date_month arrival_date_week_number
## 0 0
## arrival_date_day_of_month stays_in_weekend_nights
## 0 0
## stays_in_week_nights adults
## 0 0
## children babies
## 4 0
## meal country
## 0 0
## market_segment distribution_channel
## 0 0
## is_repeated_guest previous_cancellations
## 0 0
## previous_bookings_not_canceled reserved_room_type
## 0 0
## assigned_room_type booking_changes
## 0 0
## deposit_type agent
## 0 0
## company days_in_waiting_list
## 0 0
## customer_type adr
## 0 0
## required_car_parking_spaces total_of_special_requests
## 0 0
## reservation_status reservation_status_date
## 0 0
#Typically, there are 3 most common techniques to deal with missing values, i.e removing it, replacing with constants, or predicting using algorithms. In this case, we only have 4 missing values for children attribute, which is a very small value, thus we can simply remove them.
df <- na.omit(df)
#Recheck on missing value
any(is.na(df))
## [1] FALSE
#Check if there is any duplicate record
any(duplicated(df))
## [1] TRUE
#Count duplicated rows
sum(duplicated(df))
## [1] 31994
#Identify number of rows before handling the duplicated rows
nrow(df)
## [1] 119386
# Handle the duplicated rows by removing it
df <- distinct(df)
#Check number of rows after removing duplicated rows
nrow(df)
## [1] 87392
#Recheck on duplicated rows
any(duplicated(df))
## [1] FALSE
nrow(df[df$adults == 0 & df$children == 0 & df$babies== 0,])
## [1] 166
nrow(df[df$stays_in_weekend_nights== 0 & df$stays_in_week_nights == 0,])
## [1] 651
nrow(df[df$adults== 0 ,])
## [1] 385
#Handling all of these unreliable data
df<-df[!(df$adults == 0 & df$children == 0 & df$babies== 0),]
df<-df[!(df$stays_in_weekend_nights== 0 & df$stays_in_week_nights == 0),]
df<-df[!(df$adults== 0),]
#Check available rows
nrow(df)
## [1] 86416
df <- subset(df, select = -c(reservation_status,required_car_parking_spaces) )
country_df <- df[df$is_canceled == 0,]
country_df <- head(country_df %>% count(country,sort=TRUE),n=10)
country_df
## country n
## 1 PRT 17063
## 2 GBR 8406
## 3 FRA 7062
## 4 ESP 5357
## 5 DEU 4322
## 6 IRL 2344
## 7 ITA 1978
## 8 BEL 1655
## 9 NLD 1554
## 10 USA 1404
# Plot the bar chart
barplot(country_df$n,names.arg=country_df$country,xlab="Country",ylab="Number of Reservation",col="yellow",
main="Top 10 Reservation based on Guest's Country",border="red")
Insights: Both Resort and City hotel received a lot of reservation from Portugal, followed by United Kingdom, France, Spain, Germany and etc.
# Extract data where the booking is canceled but deposit is non-refundable
ms1 <- df[df$is_canceled == 0 & df$deposit_type == "Non Refund",]
ms1 <- select(df, hotel, arrival_date_year, market_segment, adr)
# Extract data where guest completed their stay in hotel
ms2 <- df[df$is_canceled == 0,]
ms2 <- select(df, hotel, arrival_date_year, market_segment, adr)
# Combine dataframe ms1 and ms2
ms <- rbind(ms1,ms2)
ms <- ms %>%
group_by(market_segment) %>%
summarise(Revenue = sum(adr))
ms <- ms[order(ms$Revenue, decreasing=TRUE),]
ms <- head(ms, 5)
ms
## # A tibble: 5 × 2
## market_segment Revenue
## <chr> <dbl>
## 1 Online TA 12164432.
## 2 Direct 2748546.
## 3 Offline TA/TO 2270686.
## 4 Groups 739952.
## 5 Corporate 574092.
# Get the library.
library(plotrix)
# Plot the chart.
pie3D(ms$Revenue,labels = ms$market_segment, main = "Hotel Revenue By Market Segment",explode = 0.1)
Considering there is a targeted variable in dataset which indicates whether the booking is canceled or not (binary case), this dataset can be fitted into classifier algorithm to predict future cancellation.
#reserved_room_type and assigned_room_type is similar feature. We can combine these into one feature only. We will put 1 if the guest was assigned to the same room that they reserved and 0 if its different.
df2 <- df
df2["room_given"]=0
df2 <- df2 %>%
mutate(room_given = if_else(reserved_room_type == assigned_room_type, 1, 0))
#previous_cancellations and previous_bookings_not_canceled describes about cancellation. We can combine these into one feature only. We will put 1 if previous_cancellations is higher than previous_bookings_not_canceled and 0 if its not
df2 <- df2 %>%
mutate(cancellation_tendency = if_else(previous_cancellations > previous_bookings_not_canceled, 1, 0))
df2 <- subset(df2, select = -c(reserved_room_type, assigned_room_type,previous_cancellations, previous_bookings_not_canceled ))
In this section, we have partitioned the data into train and test sets with 70-30 ratio respectively. The training set will be used for model development where the machine learning models will identify the cancellation pattern while the test set will be utilised to evaluate the performance of our models.
set.seed(50)
df2$is_canceled <-as.factor(df2$is_canceled)
trainIndex <- createDataPartition(df2$is_canceled, p = .7,list = FALSE,times = 1)
train_data<- df2[ trainIndex,]
test_data <- df2[-trainIndex,]
table(train_data$is_canceled)
##
## 0 1
## 43761 16731
Most of data in real-world are imbalance in nature. This occurs when the distribution of the target class is not uniform among the different class levels. For classifier, this will lead to a huge challenge in detecting the class of interest which resides on the minority class. For our case, we are interested in knowing whether the guest will cancel their hotel reservation or not, in which the positive class (cancel reservation) is in minority group.
data <- data.frame(
name=c("A","B","C","D","E") ,
value=c(3,12,5,18,45)
)
# Barplot
ggplot(df2, aes(x=is_canceled)) +
geom_bar(color="red")
To handle class imbalanced problem, we used downSample. downSample will randomly eliminate the sample from majority class to make the class distributions equal.
set.seed(50)
#DownSampling
train_data_down<-downSample(x=train_data[,-ncol(train_data)],
y=train_data$is_canceled)
table(train_data_down$is_canceled)
##
## 0 1
## 16731 16731
train_data_down <- subset(train_data_down, select = -c(Class))
#####RANDOM FOREST
Random forests or random decision forests is an ensemble learning method for classification, regression and other tasks that operates by constructing a multitude of decision trees at training time. We are going to use Random Forest Model to predict the booking cancellation pattern.
#import library randomForest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
#Build the classfication model using randomForest
set.seed(45)
#For train data
model.rf = randomForest(is_canceled ~ ., data = train_data,
keep.forest = TRUE, ntree = 150)
#For train data down
model_down.rf = randomForest(is_canceled ~ ., data = train_data_down,
keep.forest = TRUE, ntree = 150)
Display the model
#print the train data model
print(model.rf)
##
## Call:
## randomForest(formula = is_canceled ~ ., data = train_data, keep.forest = TRUE, ntree = 150)
## Type of random forest: classification
## Number of trees: 150
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 8.31%
## Confusion matrix:
## 0 1 class.error
## 0 42542 1219 0.02785585
## 1 3807 12924 0.22754169
#print train data down model
print(model_down.rf)
##
## Call:
## randomForest(formula = is_canceled ~ ., data = train_data_down, keep.forest = TRUE, ntree = 150)
## Type of random forest: classification
## Number of trees: 150
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 11.87%
## Confusion matrix:
## 0 1 class.error
## 0 14912 1819 0.1087203
## 1 2153 14578 0.1286833
We can plot the model to show the class error rate. As the number of trees increases, the error rate approaches zero.
#plot the train data model
plot(model.rf, main = "Error rate of random forest")
#plot the train data down model
plot(model_down.rf, main = "Error rate of random forest")
We can use the important chart to display the variables that affected the random forest, from greatest impact to least impact, from top to bottom.
#For train data
varImpPlot(model.rf, pch = 20, main = "Importance of Variables")
#For train data down
varImpPlot(model_down.rf, pch = 20, main = "Importance of Variables")
Now we will evaluate the performance of our random forest models by using confusion matrix. First, we need to use our models to make prediction.
#For train data
pred_Test_rd = predict(model.rf, test_data[,setdiff(names(test_data),"is_canceled")],
type="response",
norm.votes=TRUE
)
#For train data down
pred_Test_rd_down = predict(model_down.rf, test_data[,setdiff(names(test_data),"is_canceled")],
type="response",
norm.votes=TRUE
)
Then, we will need to use confusion matrix to evaluate the performance our random forest models by finding the F1 Score and Accuracy.
#For train data
tab_Test = table(actual=test_data$is_canceled, predicted=pred_Test_rd);
confusionMatrix(tab_Test, mode = "everything", positive = "1")
## Confusion Matrix and Statistics
##
## predicted
## actual 0 1
## 0 18220 534
## 1 4966 2204
##
## Accuracy : 0.7878
## 95% CI : (0.7828, 0.7928)
## No Information Rate : 0.8944
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3447
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.80497
## Specificity : 0.78582
## Pos Pred Value : 0.30739
## Neg Pred Value : 0.97153
## Precision : 0.30739
## Recall : 0.80497
## F1 : 0.44489
## Prevalence : 0.10562
## Detection Rate : 0.08502
## Detection Prevalence : 0.27658
## Balanced Accuracy : 0.79539
##
## 'Positive' Class : 1
##
#For train data down
tab_Test_down = table(actual=test_data$is_canceled, predicted=pred_Test_rd_down);
confusionMatrix(tab_Test_down,mode="everything", positive = "1")
## Confusion Matrix and Statistics
##
## predicted
## actual 0 1
## 0 17090 1664
## 1 3495 3675
##
## Accuracy : 0.801
## 95% CI : (0.7961, 0.8058)
## No Information Rate : 0.7941
## P-Value [Acc > NIR] : 0.002829
##
## Kappa : 0.4601
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6883
## Specificity : 0.8302
## Pos Pred Value : 0.5126
## Neg Pred Value : 0.9113
## Precision : 0.5126
## Recall : 0.6883
## F1 : 0.5876
## Prevalence : 0.2059
## Detection Rate : 0.1418
## Detection Prevalence : 0.2766
## Balanced Accuracy : 0.7593
##
## 'Positive' Class : 1
##
Let us make a table to compare both random forest model
library(knitr)
DF = data.frame(Model <- c("Random Forest with Train Data", "Random Forest with Train Data Down"), Accuracy <- c(0.7878,0.801), Sensitivity <- c(0.80497,0.6883), Specificity <- c(0.78582,0.8302),Precision <- c(0.30739,0.5126), Recall <- c(0.80497,0.6883), F1 <- c(0.44489,0.5876))
kable(DF, col.names = c("Model","Accuracy","Sensitivity","Specificity","Precision","Recall","F1 Score"))
| Model | Accuracy | Sensitivity | Specificity | Precision | Recall | F1 Score |
|---|---|---|---|---|---|---|
| Random Forest with Train Data | 0.7878 | 0.80497 | 0.78582 | 0.30739 | 0.80497 | 0.44489 |
| Random Forest with Train Data Down | 0.8010 | 0.68830 | 0.83020 | 0.51260 | 0.68830 | 0.58760 |
Random Forest Model with Train Data Down has higher precision of 0.5 which means it can predict the cancellation of the hotel room correctly 50% of the time while Random Forest Model with Train Data only has precision of 0.30739. Random Forest Model with Train Data Down has lower recall of 0.6883 compare to Random Forest Model with Train Data of 0.80497, which means it has lower percentage of actual cancellation of the hotel room that were correctly classified. But Random Forest Model with Train Data Down has higher accuracy and F1 score compared to Random Forest with Train Data. Overall, Random Forest Model with Train Data Down has better performance compared to Random Forest Model with Train Data. Downsampling has improved the performance.
# Convert some of the numeric variables into factors
df$is_canceled <- as.factor(df$is_canceled)
df$arrival_date_year <- as.factor(df$arrival_date_year)
# Rearrange order of the levels of the arrival_date_month factor so they are in chronological order when plotting them
df$arrival_date_month <- factor(df$arrival_date_month, levels=c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))
# remove unused Resort data
df_resort <- subset(df, hotel=="Resort Hotel", select= -c(hotel,country
,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,reservation_status_date))
dim(df_resort)
## [1] 33596 17
# Convert the arrival_date_year, arrival_date_month and arrival_date_day_of_month variables into one column to be used as a date
library(lubridate) # For formatting dates
df_resort$arrival_date <- with(df_resort, paste(arrival_date_year, arrival_date_month, arrival_date_day_of_month, sep="-"))
df_resort$arrival_date_formatted <- parse_date_time(df_resort$arrival_date, orders = "ymd")
head(df_resort[ , c("arrival_date_year", "arrival_date_month", "arrival_date_day_of_month", "arrival_date", "arrival_date_formatted")]) #Check
## arrival_date_year arrival_date_month arrival_date_day_of_month arrival_date
## 3 2015 July 1 2015-July-1
## 4 2015 July 1 2015-July-1
## 5 2015 July 1 2015-July-1
## 6 2015 July 1 2015-July-1
## 7 2015 July 1 2015-July-1
## 8 2015 July 1 2015-July-1
## arrival_date_formatted
## 3 2015-07-01
## 4 2015-07-01
## 5 2015-07-01
## 6 2015-07-01
## 7 2015-07-01
## 8 2015-07-01
# Descriptive stats
summary(df_resort)
## is_canceled lead_time arrival_date_year arrival_date_month
## 0:25632 Min. : 0.00 2015: 6662 August : 4633
## 1: 7964 1st Qu.: 8.00 2016:15418 July : 4274
## Median : 47.00 2017:11516 May : 2907
## Mean : 83.85 April : 2808
## 3rd Qu.:139.00 June : 2735
## Max. :709.00 October: 2684
## (Other):13555
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## Min. : 1.00 Min. : 1.00 Min. : 0.000
## 1st Qu.:16.00 1st Qu.: 8.00 1st Qu.: 0.000
## Median :28.00 Median :16.00 Median : 1.000
## Mean :27.13 Mean :15.91 Mean : 1.228
## 3rd Qu.:37.00 3rd Qu.:24.00 3rd Qu.: 2.000
## Max. :53.00 Max. :31.00 Max. :19.000
##
## stays_in_week_nights adults children babies
## Min. : 0.000 Min. : 1.000 Min. : 0.00 Min. :0.00000
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 0.00 1st Qu.:0.00000
## Median : 3.000 Median : 2.000 Median : 0.00 Median :0.00000
## Mean : 3.211 Mean : 1.877 Mean : 0.15 Mean :0.01643
## 3rd Qu.: 5.000 3rd Qu.: 2.000 3rd Qu.: 0.00 3rd Qu.:0.00000
## Max. :50.000 Max. :55.000 Max. :10.00 Max. :2.00000
##
## meal market_segment reserved_room_type customer_type
## Length:33596 Length:33596 Length:33596 Length:33596
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## adr total_of_special_requests arrival_date
## Min. : -6.38 Min. :0.0000 Length:33596
## 1st Qu.: 52.34 1st Qu.:0.0000 Class :character
## Median : 80.00 Median :0.0000 Mode :character
## Mean :100.12 Mean :0.6811
## 3rd Qu.:135.00 3rd Qu.:1.0000
## Max. :508.00 Max. :5.0000
##
## arrival_date_formatted
## Min. :2015-07-01 00:00:00.00
## 1st Qu.:2016-02-25 00:00:00.00
## Median :2016-08-28 00:00:00.00
## Mean :2016-08-23 23:56:34.25
## 3rd Qu.:2017-03-16 00:00:00.00
## Max. :2017-08-31 00:00:00.00
##
From these summary statistics, we can see that:
About 23% of reservations are cancelled (7964/(7964+25632)=0.23) The average lead time (number of days between booking and arrival) is 83.85 (median 47). That is, people book their rooms about 1-2 months before their stay Most arrivals occur in July and August (most arrivals during week number 28) Most arrivals occur in the middle of the month (median 16th) Average number of weekend nights is 1.228(median 1) Average number of week nights is 3.211 (median 3) Median number of adults per stay is 2 (mean 1.877) Median number of children per stay is 0 (mean 0.15) Median number of babies is 0 (mean 0.01643) The majority of bookings have a BB (Bed & Breakfast) meal The majority of bookings were Direct followed by Offline TA (Travel Agent) /TO (Tour Operator) (market_segment) The most popular reserved_room_type was A, followed by E and D. All room types (A-C) were represented in the bookings The majority of customers were Transient (customer_type) (not part of a group or contract, and not associated with another transient booking) The average daily rate was 100.12 (median 80.00) The median number of special requests made by the customer was 0 (mean 0.6811)
library(ggplot2)
ggplot(df_resort, aes(x=adr)) +
geom_histogram(aes(y=..density..), binwidth=5, colour="black", fill="lightgray") +
geom_density(alpha=.1, fill="#FF6666") +
theme_light() +
ggtitle("Histogram of average daily rate (adr)") +
xlab("average daily rate (adr)")
so there’s a good range of values to predict.Based on the descriptive summary above, it is of note that many of the variables don’t have much variability (e.g., adults, children, babies, meal, market_segment, customer_type, total_of_special requests). The variables that have more variability and are likely to affect the average daily rate are: is_canceled, lead_time, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, reserved_room_type.
Explore associations among the different variables and adr.
library(PerformanceAnalytics)
# Plot correlations among numeric variables
num_df_resort <- select_if(df_resort, is.numeric) #Pick numeric variables
# dim(num_df_resort) #10 numeric variables
# Plot correlations and histograms using PerformanceAnalytics library
chart.Correlation(num_df_resort, histogram=TRUE, pch=19)
between adr and all the other numeric variables, we can see that there
is a strong nonlinear trend between adr and arrival_date_week_number
where adr is much higher at medium values of arrival_date_week_number
(June -August). There is a significant positive correlation between adr
and adults (r=0.20), children (r=0.36), and total_of_special_requests
(r=0.16). There is no significant correlation between adr and lead_time,
arrival_date_day_of_month, stays_in_weekend_nights,
stays_in_week_nights, and babies.
# Number of adults, number of children, total of special requests
ggplot(df_resort, aes(x=adults, y=adr)) + geom_point() + theme_light()
ggplot(df_resort, aes(x=children, y=adr)) + geom_point() + theme_light()
ggplot(df_resort, aes(x=total_of_special_requests, y=adr)) + geom_point() + theme_light()
# Explore arrival date week number trends
df_resort %>% group_by(arrival_date_week_number) %>% summarise(mean_adr=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean_adr))
## # A tibble: 53 × 2
## arrival_date_week_number mean_adr
## <int> <dbl>
## 1 33 197.
## 2 34 196.
## 3 32 192.
## 4 31 181.
## 5 35 174.
## 6 30 173.
## 7 29 161.
## 8 28 145.
## 9 36 130.
## 10 27 126.
## # … with 43 more rows
week <- ggplot(df_resort, aes(x=arrival_date_week_number, y=adr)) +
geom_point() +
geom_smooth() +
theme_light() +
ggtitle("Average daily rate vs. Arrival date week number") +
xlab("Arrival date week number") +
ylab("Average daily rate (adr)")
# Explore monthly trends
df_resort %>% group_by(arrival_date_month) %>% summarise(mean_adr=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean_adr))
## # A tibble: 12 × 2
## arrival_date_month mean_adr
## <fct> <dbl>
## 1 August 189.
## 2 July 158.
## 3 June 113.
## 4 September 101.
## 5 May 81.5
## 6 April 79.8
## 7 December 67.1
## 8 October 64.7
## 9 March 58.2
## 10 February 54.6
## 11 January 49.9
## 12 November 49.6
month <- ggplot(df_resort, aes(x=arrival_date_month, y=adr)) +
geom_boxplot() +
theme_light() +
ggtitle("Average daily rate vs. Arrival month") +
xlab("Arrival month") +
ylab("Average daily rate (adr)")
# Explore yearly trends
df_resort %>% group_by(arrival_date_year) %>% summarise(mean_adr=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean_adr))
## # A tibble: 3 × 2
## arrival_date_year mean_adr
## <fct> <dbl>
## 1 2017 114.
## 2 2015 95.9
## 3 2016 91.9
library(gridExtra)
year <- ggplot(df_resort, aes(x=arrival_date_year, y=adr)) +
geom_boxplot() +
theme_light() +
ggtitle("Average daily rate vs. Arrival date year") +
xlab("Arrival date year") +
ylab("Average daily rate (adr)")
# Plot week, monthly, and yearly trends on the same display
grid.arrange(week, month, year, nrow=3, ncol=1)
Noteworthy from the plots above are that there is a strong nonlinear
trend between week number and adr; adr increases from January til
August, and then decreases from August to December.
# Explore full arrival date trends
ggplot(df_resort, aes(x=arrival_date_formatted, y=adr)) +
geom_point() +
theme_light() +
ggtitle("Average daily rate vs. Arrival date") +
xlab("Arrival date") +
ylab("Average daily rate (adr)")
# Plot arrival_date_day_of_month for each month separately
df_resort_adr_by_month_day <- df_resort %>% group_by(arrival_date_month, arrival_date_day_of_month) %>% summarise(mean_adr = mean(adr)) %>% arrange(arrival_date_month, arrival_date_day_of_month)
## `summarise()` has grouped output by 'arrival_date_month'. You can override
## using the `.groups` argument.
ggplot(data = df_resort_adr_by_month_day, aes(x=arrival_date_day_of_month, y=mean_adr, colour=arrival_date_month)) +
geom_point() +
geom_smooth(method='lm', se=FALSE) +
facet_wrap( ~ arrival_date_month) +
ggtitle("Mean average daily rate (adr) per day of month by month") +
xlab("Day of month") + ylab("Mean average daily rate (adr)") +
scale_color_brewer(palette = "Paired") +
theme_light() +
theme(legend.position = "none") +
theme(strip.background = element_rect(color="black", size=1)) + #Facet label rectangle contour
theme(strip.text.x = element_text(color = "black")) #Facet label font colour
## `geom_smooth()` using formula 'y ~ x'
Within each month, prices remain relatively the same across days except
for June-July and December when later days are more expensive than
earlier days.
let’s explore some of the categorical variables
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
# Explore categorical variables
cat_df_resort <- select_if(df_resort, negate(is.numeric)) #Pick non-numeric variables
dim(cat_df_resort) #9 variables are not numeric
## [1] 33596 9
# Explore cancelled reservations
df_resort %>% group_by(is_canceled) %>% summarise(mean=mean(adr, na.rm=TRUE)) %>% arrange(desc(mean))
## # A tibble: 2 × 2
## is_canceled mean
## <fct> <dbl>
## 1 1 119.
## 2 0 94.3
ggplot(df_resort, aes(x=is_canceled, y=adr, colour=is_canceled)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
theme_light() +
ggtitle("Average daily rate vs. Cancellation status") +
xlab("Cancellation status") +
ylab("Average daily rate (adr)") +
scale_x_discrete(labels=c("Not cancelled","Cancelled")) +
theme(legend.position = "none")
higher adr than non-cancelled reservations
# Explore market segment
df_resort %>% group_by(market_segment) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 6 × 3
## market_segment mean n
## <chr> <dbl> <int>
## 1 Online TA 116. 16456
## 2 Direct 114. 6169
## 3 Offline TA/TO 76.4 6554
## 4 Groups 64.4 2277
## 5 Corporate 52.7 1951
## 6 Complementary 3.87 189
ggplot(df_resort, aes(x=market_segment, y=adr, colour=market_segment)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
theme_light() +
ggtitle("Average daily rate vs. Market segment") +
xlab("Market segment") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Direct reservations seem to have higher adr but also have a large range
(similar to Online TA).
# Explore meal
df_resort %>% group_by(meal) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 5 × 3
## meal mean n
## <chr> <dbl> <int>
## 1 FB 147. 348
## 2 HB 132. 6256
## 3 Undefined 107. 485
## 4 BB 92.0 26435
## 5 SC 6.21 72
ggplot(df_resort, aes(x=meal, y=adr, colour=meal)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
scale_color_brewer(palette = "RdBu") +
theme_light() +
ggtitle("Average daily rate vs. Type of Meal") +
xlab("Meal") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Most reservations are BB, followed by HB
# Explore reserved_room_type
df_resort %>% group_by(reserved_room_type) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 9 × 3
## reserved_room_type mean n
## <chr> <dbl> <int>
## 1 H 190. 592
## 2 G 171. 1561
## 3 C 163. 893
## 4 F 135. 1056
## 5 L 125. 6
## 6 E 117. 4550
## 7 D 106. 6577
## 8 B 105. 3
## 9 A 79.8 18358
ggplot(df_resort, aes(x=reserved_room_type, y=adr, colour=reserved_room_type)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
scale_color_brewer(palette = "Dark2") +
theme_light() +
ggtitle("Average daily rate vs. Room Type") +
xlab("Reserved Room Type") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
## Warning: Removed 6 rows containing missing values (geom_point).
Most reservations are room type A (also seem to have the lowest adr)
# Explore customer_type
df_resort %>% group_by(customer_type) %>% summarise(mean=mean(adr, na.rm=TRUE), n=n()) %>% arrange(desc(mean))
## # A tibble: 4 × 3
## customer_type mean n
## <chr> <dbl> <int>
## 1 Transient 105. 27046
## 2 Transient-Party 80.5 4623
## 3 Contract 80.1 1659
## 4 Group 76.9 268
ggplot(df_resort, aes(x=customer_type, y=adr, colour=customer_type)) +
geom_boxplot() +
geom_jitter(position=position_jitter(0.2)) +
scale_color_brewer(palette = "Set2") +
theme_light() +
ggtitle("Average daily rate vs. Customer Type") +
xlab("Customer Type") +
ylab("Average daily rate (adr)") +
theme(legend.position = "none")
Most reservations are Transient
So far, we’ve seen that there is a strong nonlinear trend between adr and arrival_date_week_number / arrival_date_month. There are some positive correlations between adr and adults, children, and total_of_special_requests. Meal, market_segment, and customer_type don’t have much variability; reserved_room_type does have variability.
Next, let’s check for interactions between arrival_date_month and the other categorical variables. Categorical variables: is_canceled, market_segment, meal, reserved_room_type, customer_type
par(mfrow=c(2,1))
interaction.plot(df_resort$arrival_date_month, df_resort$is_canceled, df_resort$adr, xlab="Arrival date month", ylab="Mean of adr", legend=TRUE, trace.label="Is cancelled")
interaction.plot(df_resort$arrival_date_month, df_resort$market_segment, df_resort$adr, xlab="Market segment", ylab="Mean of adr", legend=TRUE, trace.label="Market Segment")
interaction.plot(df_resort$arrival_date_month, df_resort$meal, df_resort$adr, xlab="Meal", ylab="Mean of adr", legend=TRUE, trace.label="Meal")
interaction.plot(df_resort$arrival_date_month, df_resort$reserved_room_type, df_resort$adr, xlab="Reserved Room Type", ylab="Mean of adr", legend=TRUE, trace.label="Room Type")
interaction.plot(df_resort$arrival_date_month, df_resort$customer_type, df_resort$adr, xlab="Customer Type", ylab="Mean of adr", legend=TRUE, trace.label="Customer Type")
There are no obvious interactions between categorical variables from the visual displays.
Now, let’s run some predictive models. I will be looking at different error metrics to evaluate models. The error is basically the difference between the predicted and actual/observed value. We will be focusing on: Mean Squared Error (MSE) - take the mean of the squared errors Root Mean Squared Error (RMSE) - take the square root of the mean of the squared errors Mean Absolute Error (MAE) - take the mean of the absolute value of the errors
First, let’s subset the data set and pick the values used for training and testing
# Choose the variables that will be used for modeling (all variables except the newly created formatted arrival dates)
df_resort_short <- subset(df_resort, select=c(is_canceled, lead_time, arrival_date_year, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, market_segment, reserved_room_type, customer_type, total_of_special_requests, adr))
dim(df_resort_short)
## [1] 33596 17
# Pick rows to be used for training
set.seed(10)
train <- sample(1:nrow(df_resort_short), round(nrow(df_resort_short)*0.75, 0)) #Pick rows to be used for training (75% of the data; 25197 rows)
length(train)
## [1] 25197
# Values used for testing
y <- df_resort_short[-train, "adr"] # adr observed values used for evaluating model performance in test dataset
# Train model
linear.reg <- lm(adr ~ ., data=df_resort_short, subset=train)
summary(linear.reg)
##
## Call:
## lm(formula = adr ~ ., data = df_resort_short, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.782 -13.424 -0.935 12.766 304.082
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -80.260038 2.598795 -30.884 < 2e-16 ***
## is_canceled1 2.222281 0.425410 5.224 1.77e-07 ***
## lead_time -0.112102 0.002406 -46.602 < 2e-16 ***
## arrival_date_year2016 22.307764 0.523904 42.580 < 2e-16 ***
## arrival_date_year2017 32.394966 0.664515 48.750 < 2e-16 ***
## arrival_date_monthFebruary 23.557612 2.717306 8.669 < 2e-16 ***
## arrival_date_monthMarch 48.540573 5.036693 9.637 < 2e-16 ***
## arrival_date_monthApril 85.009791 7.570384 11.229 < 2e-16 ***
## arrival_date_monthMay 112.337640 10.101769 11.121 < 2e-16 ***
## arrival_date_monthJune 162.039750 12.631470 12.828 < 2e-16 ***
## arrival_date_monthJuly 223.395818 15.136741 14.759 < 2e-16 ***
## arrival_date_monthAugust 271.151417 17.723805 15.299 < 2e-16 ***
## arrival_date_monthSeptember 226.672597 20.306151 11.163 < 2e-16 ***
## arrival_date_monthOctober 204.999538 22.798865 8.992 < 2e-16 ***
## arrival_date_monthNovember 204.840023 25.352664 8.080 6.79e-16 ***
## arrival_date_monthDecember 235.820056 27.830566 8.473 < 2e-16 ***
## arrival_date_week_number -4.243435 0.582267 -7.288 3.24e-13 ***
## arrival_date_day_of_month 0.807103 0.085221 9.471 < 2e-16 ***
## stays_in_weekend_nights -1.665002 0.215581 -7.723 1.18e-14 ***
## stays_in_week_nights 0.248157 0.103783 2.391 0.01680 *
## adults 3.700115 0.243307 15.208 < 2e-16 ***
## children 10.442744 0.528167 19.772 < 2e-16 ***
## babies -2.177810 1.310274 -1.662 0.09650 .
## mealFB 44.341629 1.683466 26.339 < 2e-16 ***
## mealHB 28.690495 0.452387 63.420 < 2e-16 ***
## mealSC -64.864107 3.601544 -18.010 < 2e-16 ***
## mealUndefined 41.501950 1.459786 28.430 < 2e-16 ***
## market_segmentCorporate 66.043190 2.289753 28.843 < 2e-16 ***
## market_segmentDirect 82.887394 2.210459 37.498 < 2e-16 ***
## market_segmentGroups 73.123677 2.344745 31.186 < 2e-16 ***
## market_segmentOffline TA/TO 58.406229 2.233712 26.148 < 2e-16 ***
## market_segmentOnline TA 82.470686 2.195459 37.564 < 2e-16 ***
## reserved_room_typeB 2.574345 18.755116 0.137 0.89083
## reserved_room_typeC 21.697328 1.252948 17.317 < 2e-16 ***
## reserved_room_typeD 13.500268 0.453405 29.775 < 2e-16 ***
## reserved_room_typeE 24.938094 0.521356 47.833 < 2e-16 ***
## reserved_room_typeF 32.804345 1.008086 32.541 < 2e-16 ***
## reserved_room_typeG 52.970708 1.083828 48.874 < 2e-16 ***
## reserved_room_typeH 68.172803 1.394388 48.891 < 2e-16 ***
## reserved_room_typeL -22.853664 15.320170 -1.492 0.13578
## customer_typeGroup -6.880571 2.097330 -3.281 0.00104 **
## customer_typeTransient 7.831248 0.887199 8.827 < 2e-16 ***
## customer_typeTransient-Party 10.334461 1.023457 10.098 < 2e-16 ***
## total_of_special_requests 1.367967 0.224786 6.086 1.18e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.51 on 25153 degrees of freedom
## Multiple R-squared: 0.8249, Adjusted R-squared: 0.8247
## F-statistic: 2757 on 43 and 25153 DF, p-value: < 2.2e-16
# Check for homoscedasticity
par(mfrow=c(1,1))
plot(linear.reg)
# Make predictions on the test dataset
predict.linear.reg <- predict(linear.reg, newdata=df_resort_short[-train, ])
MSE.linear.reg <- mean((predict.linear.reg - y)^2) #679.1063
RMSE.linear.reg <- sqrt(mean((predict.linear.reg - y)^2)) #26.05967
MAE.linear.reg <- mean(abs(predict.linear.reg - y)) #18.46738
library(rpart) #For regression trees
# Train model using default regression tree from rpart
tree <- rpart(adr ~ ., df_resort_short, subset=train)
# Find optimal tree (complexity parameter) using cross validation
# Train model
tree2 <- rpart(adr ~ ., df_resort_short, subset=train, cp=0.001)
# Print cp (complexity parameter) table to find optimal cp
printcp(tree2)
##
## Regression tree:
## rpart(formula = adr ~ ., data = df_resort_short, subset = train,
## cp = 0.001)
##
## Variables actually used in tree construction:
## [1] adults arrival_date_day_of_month
## [3] arrival_date_month arrival_date_week_number
## [5] arrival_date_year lead_time
## [7] market_segment meal
## [9] reserved_room_type
##
## Root node error: 100956384/25197 = 4006.7
##
## n= 25197
##
## CP nsplit rel error xerror xstd
## 1 0.4902283 0 1.00000 1.00009 0.0108113
## 2 0.0577273 1 0.50977 0.50985 0.0067594
## 3 0.0451942 2 0.45204 0.45215 0.0066022
## 4 0.0371362 3 0.40685 0.40696 0.0062182
## 5 0.0235549 4 0.36971 0.36985 0.0059590
## 6 0.0165145 5 0.34616 0.34806 0.0056883
## 7 0.0164495 6 0.32964 0.32478 0.0054913
## 8 0.0126920 7 0.31320 0.30985 0.0053160
## 9 0.0107003 8 0.30050 0.30066 0.0052072
## 10 0.0093760 9 0.28980 0.29166 0.0050932
## 11 0.0085183 10 0.28043 0.28060 0.0050320
## 12 0.0071745 11 0.27191 0.27419 0.0049636
## 13 0.0068083 12 0.26473 0.26992 0.0048271
## 14 0.0060822 13 0.25793 0.26294 0.0047059
## 15 0.0057796 14 0.25184 0.25826 0.0045575
## 16 0.0040516 15 0.24606 0.24806 0.0043304
## 17 0.0040134 16 0.24201 0.24369 0.0042813
## 18 0.0036488 17 0.23800 0.24050 0.0042049
## 19 0.0036023 18 0.23435 0.23704 0.0040806
## 20 0.0033570 19 0.23075 0.23397 0.0040365
## 21 0.0033002 20 0.22739 0.23105 0.0040227
## 22 0.0032648 21 0.22409 0.22952 0.0040128
## 23 0.0029518 22 0.22083 0.22395 0.0039563
## 24 0.0029461 23 0.21787 0.22069 0.0038622
## 25 0.0027979 24 0.21493 0.21918 0.0038415
## 26 0.0026038 25 0.21213 0.21688 0.0038178
## 27 0.0025477 26 0.20953 0.21348 0.0037925
## 28 0.0024763 27 0.20698 0.21149 0.0037922
## 29 0.0024720 28 0.20450 0.21059 0.0037927
## 30 0.0022169 29 0.20203 0.20666 0.0037774
## 31 0.0021381 30 0.19981 0.20520 0.0037827
## 32 0.0021155 31 0.19768 0.20477 0.0037777
## 33 0.0018295 32 0.19556 0.20233 0.0036913
## 34 0.0015912 33 0.19373 0.19921 0.0036742
## 35 0.0015677 34 0.19214 0.19811 0.0036872
## 36 0.0015245 35 0.19057 0.19779 0.0036869
## 37 0.0014721 36 0.18905 0.19626 0.0037350
## 38 0.0014689 37 0.18757 0.19524 0.0037189
## 39 0.0014344 38 0.18611 0.19493 0.0037183
## 40 0.0013929 39 0.18467 0.19341 0.0037043
## 41 0.0013815 40 0.18328 0.19220 0.0036907
## 42 0.0013398 41 0.18190 0.19141 0.0036842
## 43 0.0011722 42 0.18056 0.18857 0.0037358
## 44 0.0011573 43 0.17939 0.18683 0.0037289
## 45 0.0011098 44 0.17823 0.18641 0.0037463
## 46 0.0010939 45 0.17712 0.18571 0.0037436
## 47 0.0010709 46 0.17602 0.18542 0.0037414
## 48 0.0010617 47 0.17495 0.18492 0.0037380
## 49 0.0010403 48 0.17389 0.18443 0.0037345
## 50 0.0010337 49 0.17285 0.18408 0.0037329
## 51 0.0010127 50 0.17182 0.18385 0.0037319
## 52 0.0010000 51 0.17080 0.18292 0.0036944
# Find out tree with lowest xerror (cross validation error) and plot the line that's 1SD above its xerror (red line)
myCPtable <- tree2$cptable
id.min <- which.min(myCPtable[,'xerror']) #22, 22 splits
my1se.err <- myCPtable[id.min,'xerror'] + myCPtable[id.min, 'xstd'] # 0.4199426
plotcp(tree2)
abline(h=my1se.err, col="red")
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library(rpart.plot)
library(RColorBrewer)
# Pick the smallest tree below the red line, find its cp, and prune the tree with that cp value (optimal tree)
id.1se <- min(which(myCPtable[,'xerror'] < my1se.err))
CP.1se <- myCPtable[id.1se, 'CP']
tree.2.pruned <- prune.rpart(tree2, CP.1se)
# Fancy plot
fancyRpartPlot(tree.2.pruned, caption = NULL)
#### MODEL EVALUATION
# Make predictions on the test dataset
predict.tree.2.pruned <- predict(tree.2.pruned, newdata=df_resort_short[-train, ])
MSE.tree.2.pruned <- mean((predict.tree.2.pruned - y)^2) #741.5938
RMSE.tree.2.pruned <- sqrt(mean((predict.tree.2.pruned - y)^2)) # 27.23222
MAE.tree.2.pruned <- mean(abs(predict.tree.2.pruned - y)) #18.70728
# Create table with MSE, RMSE, MAE for each model
results <- data.frame(Model = c( "Linear regression", "DECISION tree"),
MSE = c(MSE.linear.reg, MSE.tree.2.pruned),
RMSE = c(RMSE.linear.reg, RMSE.tree.2.pruned),
MAE = c(MAE.linear.reg, MAE.tree.2.pruned))
#results
library(knitr)
kable(results, format="pandoc", digits=2, caption = "Summary results for each model performance") #Nicer table using kable
| Model | MSE | RMSE | MAE |
|---|---|---|---|
| Linear regression | 679.11 | 26.06 | 18.47 |
| DECISION tree | 741.59 | 27.23 | 18.71 |
# All predictions
predictions <- data.frame(actual = y,linear.reg = predict.linear.reg, tree.2.pruned = predict.tree.2.pruned)
# Transform from wide to long for ggplot2
library(reshape2) #For reshaping data
predictions_long <- melt(predictions,
id.vars="actual",
measure.vars=c("linear.reg", "tree.2.pruned"),
variable.name="model",
value.name="prediction")
# Plot actual vs. predicted values for each model
# Create new facet label names for each model to be used for plotting
model.labs <- c( "Linear regression", "Decision tree")
names(model.labs) <- c("linear.reg", "tree.2.pruned")
ggplot(data = predictions_long, aes(x=actual, y=prediction, colour=model)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
facet_wrap( ~ model, labeller = labeller(model = model.labs)) +
ggtitle("Predicted vs. Actual values for each model") +
xlab("Actual values") + ylab("Predicted values") +
scale_colour_viridis_d(option="plasma") +
theme_light() +
theme(legend.position = "none") +
theme(strip.background = element_rect(color="black", size=1)) + #Facet label rectangle contour
theme(strip.text.x = element_text(color = "black")) #Facet label font colour
The summary table indicates that the best performing model was the
linear regression.