library(tidyverse)
library(Hmisc)
library(corrplot)
library(pscl)
library(rpart)
library(rpart.plot)
library(rattle)
library(randomForest)
library(caret)
library(pROC)
library(xgboost)
dfm=read.csv("df3a.csv",header=TRUE)
dim(dfm)
## [1] 103938 45
dfm = dfm %>% mutate_at (vars(is_canceled,arrival_date_year,is_repeated_guest, reserved_room_type,assigned_room_type, room_type_change, is_family, is_waitlisted, agent,company, has_pcancel, has_pbnc, has_bchanges, has_srequests, from_company, from_agent, req_parking), list(factor))
dfnum = dplyr:: select_if(dfm, is.numeric)
dfnum = data.frame(lapply(dfnum, function(x) as.numeric(as.character(x))))
res=cor(dfnum)
corrplot(res, method="color", type="upper", tl.col="black" )
fs1 = dfm %>% select(is_canceled, hotel, lead_time, arrival_date_month, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, meal, country, market_segment, distribution_channel, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, reserved_room_type, booking_changes, deposit_type, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, is_family, room_type_change, from_company, from_agent)
dim(fs1)
## [1] 103938 26
str(fs1)
## 'data.frame': 103938 obs. of 26 variables:
## $ is_canceled : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 2 1 ...
## $ hotel : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
## $ lead_time : int 7 13 14 14 0 9 85 75 23 35 ...
## $ arrival_date_month : Factor w/ 12 levels "April","August",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 1 1 2 2 2 2 3 3 4 4 ...
## $ meal : Factor w/ 5 levels "BB","FB","HB",..: 1 1 1 1 1 2 1 3 1 3 ...
## $ country : Factor w/ 12 levels "BEL","BRA","CHE",..: 7 7 7 7 11 11 11 11 11 11 ...
## $ market_segment : Factor w/ 7 levels "Aviation","Complementary",..: 4 3 7 7 4 4 7 6 7 7 ...
## $ distribution_channel : Factor w/ 4 levels "Corporate","Direct",..: 2 1 4 4 2 2 4 4 4 4 ...
## $ is_repeated_guest : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : Factor w/ 8 levels "A","B","C","D",..: 1 1 1 1 3 3 1 4 5 4 ...
## $ booking_changes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ deposit_type : Factor w/ 3 levels "No Deposit","Non Refund",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : Factor w/ 4 levels "Contract","Group",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ adr : num 75 75 98 98 107 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 1 1 0 1 1 0 0 0 ...
## $ is_family : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ room_type_change : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ from_company : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ from_agent : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 2 2 2 ...
dim(fs1)
## [1] 103938 26
set.seed(123)
y2=sample(1:103938,83150)
xtrain=fs1[y2,]
xtest=fs1[-y2,]
Hmisc::describe(dfm$is_canceled)
## dfm$is_canceled
## n missing distinct
## 103938 0 2
##
## Value 0 1
## Frequency 63880 40058
## Proportion 0.615 0.385
Hmisc::describe(xtrain$is_canceled)
## xtrain$is_canceled
## n missing distinct
## 83150 0 2
##
## Value 0 1
## Frequency 51079 32071
## Proportion 0.614 0.386
Hmisc::describe(xtest$is_canceled)
## xtest$is_canceled
## n missing distinct
## 20788 0 2
##
## Value 0 1
## Frequency 12801 7987
## Proportion 0.616 0.384
set.seed(4543)
rf <- randomForest(is_canceled ~ ., data=xtrain)
importance(rf)
## MeanDecreaseGini
## hotel 381.16416
## lead_time 4568.86290
## arrival_date_month 2039.93093
## arrival_date_day_of_month 1786.74024
## stays_in_weekend_nights 689.00927
## stays_in_week_nights 1086.93570
## meal 497.47115
## country 4353.03303
## market_segment 2280.70109
## distribution_channel 506.23066
## is_repeated_guest 75.14781
## previous_cancellations 1308.33939
## previous_bookings_not_canceled 207.19293
## reserved_room_type 685.69912
## booking_changes 781.12026
## deposit_type 5133.88929
## days_in_waiting_list 81.85541
## customer_type 1004.16212
## adr 2610.76609
## required_car_parking_spaces 901.40084
## total_of_special_requests 2016.48590
## is_family 163.64605
## room_type_change 1310.72399
## from_company 84.49535
## from_agent 181.58979
varUsed(rf, by.tree=FALSE, count =TRUE)
## [1] 78557 484007 353493 445951 216844 298838 118746 212279 71894 37671
## [11] 11668 13664 20095 183492 90153 6128 8234 58890 492046 23004
## [21] 128745 62898 26988 16132 32448
varImpPlot(rf)
Prediction
rfp = predict(rf, xtest)
cmrf = confusionMatrix(rfp, xtest$is_canceled)
cmrf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11903 1406
## 1 898 6581
##
## Accuracy : 0.8892
## 95% CI : (0.8848, 0.8934)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7629
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9298
## Specificity : 0.8240
## Pos Pred Value : 0.8944
## Neg Pred Value : 0.8799
## Prevalence : 0.6158
## Detection Rate : 0.5726
## Detection Prevalence : 0.6402
## Balanced Accuracy : 0.8769
##
## 'Positive' Class : 0
##
xtest$rfp= rfp
roc_rf= roc(response= xtest$is_canceled, predictor = factor(xtest$rfp, ordered=TRUE), plot=TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
round(cmrf$byClass["F1"], 4)
## F1
## 0.9118
mt = rpart(is_canceled ~., data = xtrain, method = "class")
fancyRpartPlot(mt)
printcp(mt)
##
## Classification tree:
## rpart(formula = is_canceled ~ ., data = xtrain, method = "class")
##
## Variables actually used in tree construction:
## [1] booking_changes country
## [3] deposit_type lead_time
## [5] market_segment previous_cancellations
## [7] required_car_parking_spaces room_type_change
## [9] total_of_special_requests
##
## Root node error: 32071/83150 = 0.3857
##
## n= 83150
##
## CP nsplit rel error xerror xstd
## 1 0.358361 0 1.00000 1.00000 0.0043766
## 2 0.024570 1 0.64164 0.64164 0.0038801
## 3 0.014915 5 0.54052 0.53325 0.0036342
## 4 0.013127 8 0.49577 0.50762 0.0035678
## 5 0.010000 10 0.46952 0.48396 0.0035033
mt$variable.importance
## deposit_type lead_time
## 1.001045e+04 2.545328e+03
## market_segment country
## 1.979184e+03 1.386944e+03
## room_type_change previous_cancellations
## 1.306250e+03 1.213458e+03
## total_of_special_requests distribution_channel
## 1.038079e+03 9.267384e+02
## from_agent required_car_parking_spaces
## 7.037041e+02 5.924014e+02
## customer_type days_in_waiting_list
## 4.841881e+02 4.390172e+02
## booking_changes adr
## 3.089387e+02 2.220925e+02
## from_company previous_bookings_not_canceled
## 2.018550e+02 1.665603e+02
## meal is_repeated_guest
## 1.494789e+02 5.143880e+01
## reserved_room_type stays_in_weekend_nights
## 8.283920e+00 6.436755e-01
## stays_in_week_nights
## 5.157373e-01
Variable importance viz
d1 = data.frame(imp = mt$variable.importance)
d2 <- d1 %>%
tibble::rownames_to_column() %>%
dplyr::rename("variable" = rowname) %>%
dplyr::arrange(imp) %>%
dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(d2) +
geom_col(aes(x = variable, y = imp),
col = "black", show.legend = F) +
coord_flip() +
scale_fill_grey() +
theme_minimal()
Prediction
tree.p = predict(mt, xtest, type = "class")
cmdt = confusionMatrix(tree.p, xtest$is_canceled)
cmdt
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10758 1802
## 1 2043 6185
##
## Accuracy : 0.815
## 95% CI : (0.8097, 0.8203)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6113
##
## Mcnemar's Test P-Value : 0.0001086
##
## Sensitivity : 0.8404
## Specificity : 0.7744
## Pos Pred Value : 0.8565
## Neg Pred Value : 0.7517
## Prevalence : 0.6158
## Detection Rate : 0.5175
## Detection Prevalence : 0.6042
## Balanced Accuracy : 0.8074
##
## 'Positive' Class : 0
##
xtest$tp1= tree.p
roc_t1= roc(response= xtest$is_canceled, predictor = factor(xtest$tp1, ordered=TRUE), plot=TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
round(cmdt$byClass["F1"], 4)
## F1
## 0.8484
model1= glm(is_canceled ~., data=xtrain, family = "binomial")
summary(model1)
##
## Call:
## glm(formula = is_canceled ~ ., family = "binomial", data = xtrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.1528 -0.6434 -0.2137 0.1980 5.3385
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.761e+00 2.566e-01 -10.761 < 2e-16 ***
## hotelResort Hotel -3.907e-02 2.745e-02 -1.423 0.154619
## lead_time 5.950e-03 1.344e-04 44.259 < 2e-16 ***
## arrival_date_monthAugust -3.863e-01 4.506e-02 -8.572 < 2e-16 ***
## arrival_date_monthDecember -5.141e-02 5.370e-02 -0.957 0.338404
## arrival_date_monthFebruary 9.717e-02 5.098e-02 1.906 0.056671 .
## arrival_date_monthJanuary -2.104e-01 5.975e-02 -3.521 0.000430 ***
## arrival_date_monthJuly -4.622e-01 4.501e-02 -10.268 < 2e-16 ***
## arrival_date_monthJune -2.460e-01 4.690e-02 -5.245 1.56e-07 ***
## arrival_date_monthMarch -1.752e-01 4.879e-02 -3.592 0.000328 ***
## arrival_date_monthMay -1.168e-01 4.536e-02 -2.576 0.009997 **
## arrival_date_monthNovember -3.201e-02 5.592e-02 -0.572 0.567008
## arrival_date_monthOctober -1.436e-01 4.777e-02 -3.007 0.002639 **
## arrival_date_monthSeptember -3.172e-01 4.929e-02 -6.435 1.24e-10 ***
## arrival_date_day_of_month -5.619e-04 1.135e-03 -0.495 0.620472
## stays_in_weekend_nights 6.305e-02 1.174e-02 5.370 7.89e-08 ***
## stays_in_week_nights 5.997e-02 6.451e-03 9.295 < 2e-16 ***
## mealFB 6.908e-02 1.329e-01 0.520 0.603244
## mealHB -2.975e-01 3.487e-02 -8.531 < 2e-16 ***
## mealSC 1.885e-01 3.473e-02 5.428 5.69e-08 ***
## mealUndefined -7.637e-01 1.164e-01 -6.558 5.44e-11 ***
## countryBRA 9.946e-01 8.685e-02 11.452 < 2e-16 ***
## countryCHE 1.701e-01 9.518e-02 1.787 0.073936 .
## countryDEU -2.430e-01 7.541e-02 -3.223 0.001268 **
## countryESP 8.464e-01 7.358e-02 11.503 < 2e-16 ***
## countryFRA 1.272e-01 7.192e-02 1.769 0.076851 .
## countryGBR 6.499e-02 7.142e-02 0.910 0.362825
## countryIRL 2.306e-01 8.274e-02 2.787 0.005326 **
## countryITA 9.748e-01 7.846e-02 12.424 < 2e-16 ***
## countryNLD -1.377e-02 9.432e-02 -0.146 0.883923
## countryPRT 2.028e+00 6.849e-02 29.616 < 2e-16 ***
## countryUSA 4.858e-01 9.078e-02 5.352 8.71e-08 ***
## market_segmentComplementary 2.399e-02 2.879e-01 0.083 0.933598
## market_segmentCorporate -4.846e-01 2.276e-01 -2.129 0.033271 *
## market_segmentDirect -4.871e-01 2.560e-01 -1.903 0.057066 .
## market_segmentGroups -1.355e-01 2.430e-01 -0.558 0.577158
## market_segmentOffline TA/TO -6.826e-01 2.436e-01 -2.802 0.005077 **
## market_segmentOnline TA 7.739e-01 2.434e-01 3.179 0.001478 **
## distribution_channelDirect -8.245e-01 1.199e-01 -6.879 6.04e-12 ***
## distribution_channelGDS -1.509e+00 2.514e-01 -6.000 1.97e-09 ***
## distribution_channelTA/TO -6.361e-01 9.538e-02 -6.669 2.57e-11 ***
## is_repeated_guest1 -8.645e-01 1.018e-01 -8.496 < 2e-16 ***
## previous_cancellations 2.196e+00 7.381e-02 29.750 < 2e-16 ***
## previous_bookings_not_canceled -3.630e-01 2.668e-02 -13.606 < 2e-16 ***
## reserved_room_typeB -8.883e-02 1.003e-01 -0.885 0.375941
## reserved_room_typeC -2.373e-01 1.158e-01 -2.049 0.040420 *
## reserved_room_typeD -9.688e-04 2.853e-02 -0.034 0.972917
## reserved_room_typeE 1.062e-01 4.657e-02 2.281 0.022525 *
## reserved_room_typeF -4.072e-01 7.142e-02 -5.702 1.19e-08 ***
## reserved_room_typeG -3.323e-01 8.572e-02 -3.877 0.000106 ***
## reserved_room_typeH -2.128e-01 1.507e-01 -1.412 0.157928
## booking_changes -3.471e-01 1.986e-02 -17.476 < 2e-16 ***
## deposit_typeNon Refund 4.612e+00 1.347e-01 34.248 < 2e-16 ***
## deposit_typeRefundable 5.005e-01 2.781e-01 1.800 0.071887 .
## days_in_waiting_list -3.324e-03 6.406e-04 -5.189 2.11e-07 ***
## customer_typeGroup -3.629e-02 2.013e-01 -0.180 0.856971
## customer_typeTransient 8.787e-01 6.800e-02 12.922 < 2e-16 ***
## customer_typeTransient-Party 2.714e-01 7.246e-02 3.746 0.000180 ***
## adr 5.945e-03 3.177e-04 18.710 < 2e-16 ***
## required_car_parking_spaces -1.809e+02 9.220e+05 0.000 0.999843
## total_of_special_requests -7.283e-01 1.497e-02 -48.658 < 2e-16 ***
## is_family1 2.626e-01 4.396e-02 5.974 2.31e-09 ***
## room_type_change1 -1.850e+00 4.882e-02 -37.903 < 2e-16 ***
## from_company1 -8.658e-01 9.151e-02 -9.461 < 2e-16 ***
## from_agent1 1.428e-01 5.189e-02 2.752 0.005915 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 110887 on 83149 degrees of freedom
## Residual deviance: 61792 on 83085 degrees of freedom
## AIC: 61922
##
## Number of Fisher Scoring iterations: 11
pR2(model1)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML
## -3.089594e+04 -5.544325e+04 4.909464e+04 4.427467e-01 4.459136e-01
## r2CU
## 6.054777e-01
anova(model1, test= "Chisq")
Prediction
prob=predict(model1,xtest,type="response")
prob1=rep(0,20788)
prob1[prob>0.5]=1
cmlr= confusionMatrix(as.factor(prob1),xtest$is_canceled)
cmlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11618 2586
## 1 1183 5401
##
## Accuracy : 0.8187
## 95% CI : (0.8134, 0.8239)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6038
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9076
## Specificity : 0.6762
## Pos Pred Value : 0.8179
## Neg Pred Value : 0.8203
## Prevalence : 0.6158
## Detection Rate : 0.5589
## Detection Prevalence : 0.6833
## Balanced Accuracy : 0.7919
##
## 'Positive' Class : 0
##
roc_lr1 = roc(xtest$is_canceled, prob, plot=TRUE, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
round(cmlr$byClass["F1"], 4)
## F1
## 0.8604