Dataset

Hotel Booking Demand

Research question

Load libraries

library(tidyverse)
library(Hmisc)
library(corrplot)
library(pscl)
library(rpart)
library(rpart.plot)
library(rattle)
library(randomForest)
library(caret)
library(pROC)
library(xgboost)

Import data from part 1

dfm=read.csv("df3a.csv",header=TRUE)

Summary

dim(dfm)
## [1] 103938     45
dfm = dfm %>% mutate_at (vars(is_canceled,arrival_date_year,is_repeated_guest, reserved_room_type,assigned_room_type, room_type_change, is_family, is_waitlisted, agent,company, has_pcancel, has_pbnc, has_bchanges, has_srequests, from_company, from_agent, req_parking), list(factor))

Check correlation

dfnum = dplyr:: select_if(dfm, is.numeric)
dfnum = data.frame(lapply(dfnum, function(x) as.numeric(as.character(x))))
res=cor(dfnum)
corrplot(res, method="color", type="upper", tl.col="black" )

Select variables

fs1 = dfm %>% select(is_canceled, hotel, lead_time, arrival_date_month, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, meal, country, market_segment, distribution_channel, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, reserved_room_type, booking_changes, deposit_type, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, is_family, room_type_change, from_company, from_agent)
dim(fs1)
## [1] 103938     26
str(fs1)
## 'data.frame':    103938 obs. of  26 variables:
##  $ is_canceled                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 2 1 ...
##  $ hotel                         : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lead_time                     : int  7 13 14 14 0 9 85 75 23 35 ...
##  $ arrival_date_month            : Factor w/ 12 levels "April","August",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : int  1 1 2 2 2 2 3 3 4 4 ...
##  $ meal                          : Factor w/ 5 levels "BB","FB","HB",..: 1 1 1 1 1 2 1 3 1 3 ...
##  $ country                       : Factor w/ 12 levels "BEL","BRA","CHE",..: 7 7 7 7 11 11 11 11 11 11 ...
##  $ market_segment                : Factor w/ 7 levels "Aviation","Complementary",..: 4 3 7 7 4 4 7 6 7 7 ...
##  $ distribution_channel          : Factor w/ 4 levels "Corporate","Direct",..: 2 1 4 4 2 2 4 4 4 4 ...
##  $ is_repeated_guest             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : Factor w/ 8 levels "A","B","C","D",..: 1 1 1 1 3 3 1 4 5 4 ...
##  $ booking_changes               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ days_in_waiting_list          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ adr                           : num  75 75 98 98 107 ...
##  $ required_car_parking_spaces   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : int  0 0 1 1 0 1 1 0 0 0 ...
##  $ is_family                     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ room_type_change              : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ from_company                  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ from_agent                    : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 2 2 2 ...

Spilt data

dim(fs1)
## [1] 103938     26
set.seed(123)
y2=sample(1:103938,83150)
xtrain=fs1[y2,]
xtest=fs1[-y2,]
Hmisc::describe(dfm$is_canceled)
## dfm$is_canceled 
##        n  missing distinct 
##   103938        0        2 
##                       
## Value          0     1
## Frequency  63880 40058
## Proportion 0.615 0.385
Hmisc::describe(xtrain$is_canceled)
## xtrain$is_canceled 
##        n  missing distinct 
##    83150        0        2 
##                       
## Value          0     1
## Frequency  51079 32071
## Proportion 0.614 0.386
Hmisc::describe(xtest$is_canceled)
## xtest$is_canceled 
##        n  missing distinct 
##    20788        0        2 
##                       
## Value          0     1
## Frequency  12801  7987
## Proportion 0.616 0.384

Random forest

set.seed(4543)
rf <- randomForest(is_canceled ~ ., data=xtrain)
importance(rf)
##                                MeanDecreaseGini
## hotel                                 381.16416
## lead_time                            4568.86290
## arrival_date_month                   2039.93093
## arrival_date_day_of_month            1786.74024
## stays_in_weekend_nights               689.00927
## stays_in_week_nights                 1086.93570
## meal                                  497.47115
## country                              4353.03303
## market_segment                       2280.70109
## distribution_channel                  506.23066
## is_repeated_guest                      75.14781
## previous_cancellations               1308.33939
## previous_bookings_not_canceled        207.19293
## reserved_room_type                    685.69912
## booking_changes                       781.12026
## deposit_type                         5133.88929
## days_in_waiting_list                   81.85541
## customer_type                        1004.16212
## adr                                  2610.76609
## required_car_parking_spaces           901.40084
## total_of_special_requests            2016.48590
## is_family                             163.64605
## room_type_change                     1310.72399
## from_company                           84.49535
## from_agent                            181.58979
varUsed(rf, by.tree=FALSE, count =TRUE)
##  [1]  78557 484007 353493 445951 216844 298838 118746 212279  71894  37671
## [11]  11668  13664  20095 183492  90153   6128   8234  58890 492046  23004
## [21] 128745  62898  26988  16132  32448
varImpPlot(rf)

Prediction

rfp = predict(rf, xtest)
cmrf = confusionMatrix(rfp, xtest$is_canceled)
cmrf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 11903  1406
##          1   898  6581
##                                           
##                Accuracy : 0.8892          
##                  95% CI : (0.8848, 0.8934)
##     No Information Rate : 0.6158          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7629          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9298          
##             Specificity : 0.8240          
##          Pos Pred Value : 0.8944          
##          Neg Pred Value : 0.8799          
##              Prevalence : 0.6158          
##          Detection Rate : 0.5726          
##    Detection Prevalence : 0.6402          
##       Balanced Accuracy : 0.8769          
##                                           
##        'Positive' Class : 0               
## 
xtest$rfp= rfp
roc_rf= roc(response= xtest$is_canceled, predictor = factor(xtest$rfp, ordered=TRUE), plot=TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

round(cmrf$byClass["F1"], 4)
##     F1 
## 0.9118

Decision tree

mt = rpart(is_canceled ~., data = xtrain, method = "class")
fancyRpartPlot(mt)

printcp(mt)
## 
## Classification tree:
## rpart(formula = is_canceled ~ ., data = xtrain, method = "class")
## 
## Variables actually used in tree construction:
## [1] booking_changes             country                    
## [3] deposit_type                lead_time                  
## [5] market_segment              previous_cancellations     
## [7] required_car_parking_spaces room_type_change           
## [9] total_of_special_requests  
## 
## Root node error: 32071/83150 = 0.3857
## 
## n= 83150 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.358361      0   1.00000 1.00000 0.0043766
## 2 0.024570      1   0.64164 0.64164 0.0038801
## 3 0.014915      5   0.54052 0.53325 0.0036342
## 4 0.013127      8   0.49577 0.50762 0.0035678
## 5 0.010000     10   0.46952 0.48396 0.0035033
mt$variable.importance
##                   deposit_type                      lead_time 
##                   1.001045e+04                   2.545328e+03 
##                 market_segment                        country 
##                   1.979184e+03                   1.386944e+03 
##               room_type_change         previous_cancellations 
##                   1.306250e+03                   1.213458e+03 
##      total_of_special_requests           distribution_channel 
##                   1.038079e+03                   9.267384e+02 
##                     from_agent    required_car_parking_spaces 
##                   7.037041e+02                   5.924014e+02 
##                  customer_type           days_in_waiting_list 
##                   4.841881e+02                   4.390172e+02 
##                booking_changes                            adr 
##                   3.089387e+02                   2.220925e+02 
##                   from_company previous_bookings_not_canceled 
##                   2.018550e+02                   1.665603e+02 
##                           meal              is_repeated_guest 
##                   1.494789e+02                   5.143880e+01 
##             reserved_room_type        stays_in_weekend_nights 
##                   8.283920e+00                   6.436755e-01 
##           stays_in_week_nights 
##                   5.157373e-01

Variable importance viz

d1 = data.frame(imp = mt$variable.importance)
d2 <- d1 %>% 
  tibble::rownames_to_column() %>% 
  dplyr::rename("variable" = rowname) %>% 
  dplyr::arrange(imp) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(d2) +
  geom_col(aes(x = variable, y = imp),
           col = "black", show.legend = F) +
  coord_flip() +
  scale_fill_grey() +
  theme_minimal()

Prediction

tree.p = predict(mt, xtest, type = "class")
cmdt = confusionMatrix(tree.p, xtest$is_canceled)
cmdt
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10758  1802
##          1  2043  6185
##                                           
##                Accuracy : 0.815           
##                  95% CI : (0.8097, 0.8203)
##     No Information Rate : 0.6158          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6113          
##                                           
##  Mcnemar's Test P-Value : 0.0001086       
##                                           
##             Sensitivity : 0.8404          
##             Specificity : 0.7744          
##          Pos Pred Value : 0.8565          
##          Neg Pred Value : 0.7517          
##              Prevalence : 0.6158          
##          Detection Rate : 0.5175          
##    Detection Prevalence : 0.6042          
##       Balanced Accuracy : 0.8074          
##                                           
##        'Positive' Class : 0               
## 
xtest$tp1= tree.p
roc_t1= roc(response= xtest$is_canceled, predictor = factor(xtest$tp1, ordered=TRUE), plot=TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

round(cmdt$byClass["F1"], 4)
##     F1 
## 0.8484

Logistic regression

model1= glm(is_canceled ~., data=xtrain, family = "binomial")
summary(model1) 
## 
## Call:
## glm(formula = is_canceled ~ ., family = "binomial", data = xtrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.1528  -0.6434  -0.2137   0.1980   5.3385  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -2.761e+00  2.566e-01 -10.761  < 2e-16 ***
## hotelResort Hotel              -3.907e-02  2.745e-02  -1.423 0.154619    
## lead_time                       5.950e-03  1.344e-04  44.259  < 2e-16 ***
## arrival_date_monthAugust       -3.863e-01  4.506e-02  -8.572  < 2e-16 ***
## arrival_date_monthDecember     -5.141e-02  5.370e-02  -0.957 0.338404    
## arrival_date_monthFebruary      9.717e-02  5.098e-02   1.906 0.056671 .  
## arrival_date_monthJanuary      -2.104e-01  5.975e-02  -3.521 0.000430 ***
## arrival_date_monthJuly         -4.622e-01  4.501e-02 -10.268  < 2e-16 ***
## arrival_date_monthJune         -2.460e-01  4.690e-02  -5.245 1.56e-07 ***
## arrival_date_monthMarch        -1.752e-01  4.879e-02  -3.592 0.000328 ***
## arrival_date_monthMay          -1.168e-01  4.536e-02  -2.576 0.009997 ** 
## arrival_date_monthNovember     -3.201e-02  5.592e-02  -0.572 0.567008    
## arrival_date_monthOctober      -1.436e-01  4.777e-02  -3.007 0.002639 ** 
## arrival_date_monthSeptember    -3.172e-01  4.929e-02  -6.435 1.24e-10 ***
## arrival_date_day_of_month      -5.619e-04  1.135e-03  -0.495 0.620472    
## stays_in_weekend_nights         6.305e-02  1.174e-02   5.370 7.89e-08 ***
## stays_in_week_nights            5.997e-02  6.451e-03   9.295  < 2e-16 ***
## mealFB                          6.908e-02  1.329e-01   0.520 0.603244    
## mealHB                         -2.975e-01  3.487e-02  -8.531  < 2e-16 ***
## mealSC                          1.885e-01  3.473e-02   5.428 5.69e-08 ***
## mealUndefined                  -7.637e-01  1.164e-01  -6.558 5.44e-11 ***
## countryBRA                      9.946e-01  8.685e-02  11.452  < 2e-16 ***
## countryCHE                      1.701e-01  9.518e-02   1.787 0.073936 .  
## countryDEU                     -2.430e-01  7.541e-02  -3.223 0.001268 ** 
## countryESP                      8.464e-01  7.358e-02  11.503  < 2e-16 ***
## countryFRA                      1.272e-01  7.192e-02   1.769 0.076851 .  
## countryGBR                      6.499e-02  7.142e-02   0.910 0.362825    
## countryIRL                      2.306e-01  8.274e-02   2.787 0.005326 ** 
## countryITA                      9.748e-01  7.846e-02  12.424  < 2e-16 ***
## countryNLD                     -1.377e-02  9.432e-02  -0.146 0.883923    
## countryPRT                      2.028e+00  6.849e-02  29.616  < 2e-16 ***
## countryUSA                      4.858e-01  9.078e-02   5.352 8.71e-08 ***
## market_segmentComplementary     2.399e-02  2.879e-01   0.083 0.933598    
## market_segmentCorporate        -4.846e-01  2.276e-01  -2.129 0.033271 *  
## market_segmentDirect           -4.871e-01  2.560e-01  -1.903 0.057066 .  
## market_segmentGroups           -1.355e-01  2.430e-01  -0.558 0.577158    
## market_segmentOffline TA/TO    -6.826e-01  2.436e-01  -2.802 0.005077 ** 
## market_segmentOnline TA         7.739e-01  2.434e-01   3.179 0.001478 ** 
## distribution_channelDirect     -8.245e-01  1.199e-01  -6.879 6.04e-12 ***
## distribution_channelGDS        -1.509e+00  2.514e-01  -6.000 1.97e-09 ***
## distribution_channelTA/TO      -6.361e-01  9.538e-02  -6.669 2.57e-11 ***
## is_repeated_guest1             -8.645e-01  1.018e-01  -8.496  < 2e-16 ***
## previous_cancellations          2.196e+00  7.381e-02  29.750  < 2e-16 ***
## previous_bookings_not_canceled -3.630e-01  2.668e-02 -13.606  < 2e-16 ***
## reserved_room_typeB            -8.883e-02  1.003e-01  -0.885 0.375941    
## reserved_room_typeC            -2.373e-01  1.158e-01  -2.049 0.040420 *  
## reserved_room_typeD            -9.688e-04  2.853e-02  -0.034 0.972917    
## reserved_room_typeE             1.062e-01  4.657e-02   2.281 0.022525 *  
## reserved_room_typeF            -4.072e-01  7.142e-02  -5.702 1.19e-08 ***
## reserved_room_typeG            -3.323e-01  8.572e-02  -3.877 0.000106 ***
## reserved_room_typeH            -2.128e-01  1.507e-01  -1.412 0.157928    
## booking_changes                -3.471e-01  1.986e-02 -17.476  < 2e-16 ***
## deposit_typeNon Refund          4.612e+00  1.347e-01  34.248  < 2e-16 ***
## deposit_typeRefundable          5.005e-01  2.781e-01   1.800 0.071887 .  
## days_in_waiting_list           -3.324e-03  6.406e-04  -5.189 2.11e-07 ***
## customer_typeGroup             -3.629e-02  2.013e-01  -0.180 0.856971    
## customer_typeTransient          8.787e-01  6.800e-02  12.922  < 2e-16 ***
## customer_typeTransient-Party    2.714e-01  7.246e-02   3.746 0.000180 ***
## adr                             5.945e-03  3.177e-04  18.710  < 2e-16 ***
## required_car_parking_spaces    -1.809e+02  9.220e+05   0.000 0.999843    
## total_of_special_requests      -7.283e-01  1.497e-02 -48.658  < 2e-16 ***
## is_family1                      2.626e-01  4.396e-02   5.974 2.31e-09 ***
## room_type_change1              -1.850e+00  4.882e-02 -37.903  < 2e-16 ***
## from_company1                  -8.658e-01  9.151e-02  -9.461  < 2e-16 ***
## from_agent1                     1.428e-01  5.189e-02   2.752 0.005915 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 110887  on 83149  degrees of freedom
## Residual deviance:  61792  on 83085  degrees of freedom
## AIC: 61922
## 
## Number of Fisher Scoring iterations: 11
pR2(model1)  
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -3.089594e+04 -5.544325e+04  4.909464e+04  4.427467e-01  4.459136e-01 
##          r2CU 
##  6.054777e-01
anova(model1, test= "Chisq")

Prediction

prob=predict(model1,xtest,type="response")
prob1=rep(0,20788)
prob1[prob>0.5]=1
cmlr= confusionMatrix(as.factor(prob1),xtest$is_canceled) 
cmlr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 11618  2586
##          1  1183  5401
##                                           
##                Accuracy : 0.8187          
##                  95% CI : (0.8134, 0.8239)
##     No Information Rate : 0.6158          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6038          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9076          
##             Specificity : 0.6762          
##          Pos Pred Value : 0.8179          
##          Neg Pred Value : 0.8203          
##              Prevalence : 0.6158          
##          Detection Rate : 0.5589          
##    Detection Prevalence : 0.6833          
##       Balanced Accuracy : 0.7919          
##                                           
##        'Positive' Class : 0               
## 
roc_lr1 = roc(xtest$is_canceled, prob, plot=TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

round(cmlr$byClass["F1"], 4)
##     F1 
## 0.8604