Hotel booking demand

Objective

Pernahkah anda bertanya-tanya kapan waktu terbaik dalam setahun untuk memesan kamar hotel? pada kesempatan kali ini saya akan melakukan prediksi booking hotel demand, algoritma yang saya akan gunakan adalah logistik regression dan k-Nearest Neighbor

Library & set up

library(tidyverse)
library(gtools)
library(class)
library(lubridate)
library(caret)
library(ggplot2)
library(rsample)
library(dplyr)
library(caret)
library(lattice)

Logistics Regression

Data import

htl <- read.csv("data_input/hotel_bookings.csv")

cek struktur data

glimpse(htl)
## Rows: 119,390
## Columns: 32
## $ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Res...
## $ is_canceled                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 7...
## $ arrival_date_year              <int> 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ arrival_date_month             <chr> "July", "July", "July", "July", "Jul...
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, ...
## $ arrival_date_day_of_month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, ...
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", ...
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "...
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corpo...
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corpo...
## $ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "...
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "...
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deposit_type                   <chr> "No Deposit", "No Deposit", "No Depo...
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240"...
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NUL...
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ customer_type                  <chr> "Transient", "Transient", "Transient...
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out...
## $ reservation_status_date        <chr> "2015-07-01", "2015-07-01", "2015-07...

selanjutnya kita akan melihat missing value, missing value harus kita cek terlebih dahulu agar tidak mengganggu pemodelan.

colSums(is.na(htl))
##                          hotel                    is_canceled 
##                              0                              0 
##                      lead_time              arrival_date_year 
##                              0                              0 
##             arrival_date_month       arrival_date_week_number 
##                              0                              0 
##      arrival_date_day_of_month        stays_in_weekend_nights 
##                              0                              0 
##           stays_in_week_nights                         adults 
##                              0                              0 
##                       children                         babies 
##                              4                              0 
##                           meal                        country 
##                              0                              0 
##                 market_segment           distribution_channel 
##                              0                              0 
##              is_repeated_guest         previous_cancellations 
##                              0                              0 
## previous_bookings_not_canceled             reserved_room_type 
##                              0                              0 
##             assigned_room_type                booking_changes 
##                              0                              0 
##                   deposit_type                          agent 
##                              0                              0 
##                        company           days_in_waiting_list 
##                              0                              0 
##                  customer_type                            adr 
##                              0                              0 
##    required_car_parking_spaces      total_of_special_requests 
##                              0                              0 
##             reservation_status        reservation_status_date 
##                              0                              0

Terdapat nilah NA pada kolom children, karena nilai NA <5% maka diputuskan untuk menghilangkan nilai tersebut

htl <- na.omit(htl)

Data manipulation

htl <- htl %>% 
  mutate(is_canceled = as.factor(is_canceled),
         hotel = as.factor(hotel),
         deposit_type = as.factor(deposit_type),
         customer_type = as.factor(customer_type),
         customer_type = as.factor(customer_type),
         reservation_status_date = ymd(reservation_status_date))


glimpse(htl)
## Rows: 119,386
## Columns: 32
## $ hotel                          <fct> Resort Hotel, Resort Hotel, Resort H...
## $ is_canceled                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 7...
## $ arrival_date_year              <int> 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ arrival_date_month             <chr> "July", "July", "July", "July", "Jul...
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, ...
## $ arrival_date_day_of_month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, ...
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", ...
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "...
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corpo...
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corpo...
## $ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "...
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "...
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deposit_type                   <fct> No Deposit, No Deposit, No Deposit, ...
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240"...
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NUL...
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ customer_type                  <fct> Transient, Transient, Transient, Tra...
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out...
## $ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02,...

Pre-Processing Data

Sebelum melakukan pemodelan kita perlu melihat proporsi dari variabel target yang kita miliki pada kolom is_canceled

prop.table(table(htl$is_canceled))
## 
##         0         1 
## 0.6296048 0.3703952
table(htl$is_canceled)
## 
##     0     1 
## 75166 44220

EDA

glimpse(htl)
## Rows: 119,386
## Columns: 32
## $ hotel                          <fct> Resort Hotel, Resort Hotel, Resort H...
## $ is_canceled                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 7...
## $ arrival_date_year              <int> 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ arrival_date_month             <chr> "July", "July", "July", "July", "Jul...
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, ...
## $ arrival_date_day_of_month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, ...
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", ...
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "...
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corpo...
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corpo...
## $ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "...
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "...
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deposit_type                   <fct> No Deposit, No Deposit, No Deposit, ...
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240"...
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NUL...
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ customer_type                  <fct> Transient, Transient, Transient, Tra...
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out...
## $ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02,...
ggplot(data = htl, aes(x = hotel))+
  geom_bar(star = "count")+
  labs(title = "BOOKING REQUEST HOTEL TYPE",
       X = "HOTEL TYPE",
       Y = "NO. OF BOOKING")+
  scale_color_brewer(palette = "set2")

Dari graph di atas bisa kita lihat bahwa jumlah request berdasarkan tipe hotel paling banyak adalah city hotel yaitu 80000

htl_m <- as.data.frame(table(htl$arrival_date_month))

htl_m <- htl_m[order(htl_m$Freq, decreasing = T), ]


ggplot(data = htl_m, mapping =  aes(x = Freq, y = reorder(Var1, Freq)))+
  geom_col(aes(fill = Freq))+
  geom_text(mapping = aes(label = Freq), nudge_x = 1)+
  scale_fill_gradient(low = "blue", high = "red")+
  labs(title = "Month request booking hotel",
       subtitle = "most number of hotel booking",
       x = "Hotel Booking count",
       y = "",
       caption = "Source :Hotel booking demand",
       fill = "Booking count")+
  theme(legend.position = "none",
        plot.title = element_text(family = "Times New  Roman", face = "bold", colour = "red"))

Dari graph diatas bisa kita lihat bahwa bulan paling melakukan pemesanan kamar hotel adalah bulan agustus diikut oleh bulan july, salah satu faktor penyebabnya meningkatnya pemesanan hotel karena di bulan tersebut adalah terdapat libur sekolah

ggplot(htl, aes(arrival_date_month, fill = (is_canceled))) +
  geom_bar()+
  geom_text(stat = "count", aes(label = ..count..), hjust = 1)+
  coord_flip()+
  scale_fill_manual(name = "Booking status",
                    breaks = c("0", "1"),
                    labels = c("Not Cancelled", "Cancelled"),
                    values =  c("0" = "royalblue", "1"="red"))

Dari graph di atas kita bisa lihat bahwa traveller yang telah konfirmasi booking lalu melakukan cancel terjadi di bulan agustus lalu di bulan july, hal ini bisa terjadi karena beberapa faktor salah satunya adalah mengubah tujuan liburan.

ggplot(data = htl, mapping = aes(x = hotel, y = prop.table(stat(count)),
                                 fill = factor(is_canceled),
                                 label = scales::percent(prop.table(stat(count)))))+
  geom_bar(position = position_dodge()) +
  geom_text(stat = "count",
            position = position_dodge(.9),
            vjust = -0.5,
            size = 3)+
  scale_y_continuous(labels = scales::percent)+
  labs(title = "Cancellation Status by Hotel Type",
       x = "Hotel type",
       y = "count")+
  theme_classic()+
  scale_fill_manual(name = "Booking status",
                    breaks = c("0", "1"),
                    labels = c("Not Cancelled", "Cancelled"),
                    values =  c("0" = "royalblue", "1"="red"))

Berdasarkan graph di atas bisa kita lihat bahwa city hotel memiliki persentase calncelled cukup tinggi yaitu 38.7% dari 66.4% yang total melakukan pemesanan kamar city hotel, hampir setengah dari pemesanan city hotel yang melaukan cancelled, hal ini bisa terjadi karena berbagai faktor salah satunya harga yang tidak kompetitif dengan kompetitor.

Splitting Train-Test

Langkah selanjutnya adalah melakukan Splitting Train-Test, data train akan digunakan dalam proses pembuatan model data train akan digunakan dalam proses pembuatan model.

RNGkind(sample.kind = "Rounding")
set.seed(123)
intrain <- sample(nrow(htl), nrow(htl)*0.8)
htl_train <- htl[intrain, ]
htl_test <- htl[-intrain, ]

Modeling

prop.table(table(htl_train$is_canceled)) %>% round(2)
## 
##    0    1 
## 0.63 0.37
prop.table(table(htl_test$is_canceled)) %>% round(2)
## 
##    0    1 
## 0.63 0.37

Jika kita lihat Proporsi data target tidak seimbang maka akan dilakukan downsampling untuk menyeimbangkan data tujuannya adalah menyamakan jumlah observasi pada kelas mayoritas dan minoritas. Sehingga model klasifikasi dapat melalui proses learning yang seimbang.

library(recipes)
set.seed(410)
splitted_rec <- initial_split(data = htl, prop = 0.7, strata = "is_canceled")
rec <- recipe(is_canceled ~., training(splitted_rec)) %>% 
  step_downsample(is_canceled, ratio = 1, seed = 100) %>% 
  prep()



training_rec <- juice(rec)
test_rec <- bake(rec, testing(splitted_rec))

prop.table(table(training_rec$is_canceled)) %>% round(2)
## 
##   0   1 
## 0.5 0.5
glimpse(htl)
## Rows: 119,386
## Columns: 32
## $ hotel                          <fct> Resort Hotel, Resort Hotel, Resort H...
## $ is_canceled                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 7...
## $ arrival_date_year              <int> 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ arrival_date_month             <chr> "July", "July", "July", "July", "Jul...
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, ...
## $ arrival_date_day_of_month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, ...
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", ...
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "...
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corpo...
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corpo...
## $ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "...
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "...
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deposit_type                   <fct> No Deposit, No Deposit, No Deposit, ...
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240"...
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NUL...
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ customer_type                  <fct> Transient, Transient, Transient, Tra...
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out...
## $ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02,...

Selanjutnya kita akan melakukan pemodelan menggunakan logistik regression dengan variabel prediktor yang di anggap mempengaruhi target.

model_logit <- glm(formula = is_canceled ~lead_time +customer_type +
                      deposit_type + adr+hotel +total_of_special_requests+deposit_type+customer_type+stays_in_week_nights+stays_in_weekend_nights, family = "binomial", data = training_rec)



summary(model_logit)
## 
## Call:
## glm(formula = is_canceled ~ lead_time + customer_type + deposit_type + 
##     adr + hotel + total_of_special_requests + deposit_type + 
##     customer_type + stays_in_week_nights + stays_in_weekend_nights, 
##     family = "binomial", data = training_rec)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1546  -0.9307  -0.1015   1.0636   2.5552  
## 
## Coefficients:
##                                Estimate Std. Error z value             Pr(>|z|)
## (Intercept)                  -1.5668056  0.0632150 -24.785 < 0.0000000000000002
## lead_time                     0.0044062  0.0001088  40.484 < 0.0000000000000002
## customer_typeGroup           -0.4787355  0.1788416  -2.677              0.00743
## customer_typeTransient        0.7173600  0.0574954  12.477 < 0.0000000000000002
## customer_typeTransient-Party -0.0928939  0.0604956  -1.536              0.12465
## deposit_typeNon Refund        5.1504871  0.1555902  33.103 < 0.0000000000000002
## deposit_typeRefundable       -0.1702899  0.2548146  -0.668              0.50395
## adr                           0.0047002  0.0001973  23.824 < 0.0000000000000002
## hotelResort Hotel            -0.3595842  0.0207715 -17.311 < 0.0000000000000002
## total_of_special_requests    -0.5377867  0.0130916 -41.079 < 0.0000000000000002
## stays_in_week_nights          0.0430687  0.0058018   7.423    0.000000000000114
## stays_in_weekend_nights       0.0663862  0.0109091   6.085    0.000000001162208
##                                 
## (Intercept)                  ***
## lead_time                    ***
## customer_typeGroup           ** 
## customer_typeTransient       ***
## customer_typeTransient-Party    
## deposit_typeNon Refund       ***
## deposit_typeRefundable          
## adr                          ***
## hotelResort Hotel            ***
## total_of_special_requests    ***
## stays_in_week_nights         ***
## stays_in_weekend_nights      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 85823  on 61907  degrees of freedom
## Residual deviance: 64968  on 61896  degrees of freedom
## AIC: 64992
## 
## Number of Fisher Scoring iterations: 8

dari hasil pemodelan di atas bisa kita lihat bahwa ada beberapa variabel yang tidak signifikan mempengaruhi terhadap target, oleh karena itu akan dilakukan model fitting menggunakan stepwise.

library(MASS)

model2 <- stepAIC(model_logit, direction = "backward")
## Start:  AIC=64992.1
## is_canceled ~ lead_time + customer_type + deposit_type + adr + 
##     hotel + total_of_special_requests + deposit_type + customer_type + 
##     stays_in_week_nights + stays_in_weekend_nights
## 
##                             Df Deviance   AIC
## <none>                            64968 64992
## - stays_in_weekend_nights    1    65005 65027
## - stays_in_week_nights       1    65023 65045
## - hotel                      1    65271 65293
## - adr                        1    65544 65566
## - customer_type              3    66133 66151
## - lead_time                  1    66658 66680
## - total_of_special_requests  1    66814 66836
## - deposit_type               2    72228 72248

Dari hasil stepwise backward kita memperoleh model sebagai berikut.

summary(model2)
## 
## Call:
## glm(formula = is_canceled ~ lead_time + customer_type + deposit_type + 
##     adr + hotel + total_of_special_requests + deposit_type + 
##     customer_type + stays_in_week_nights + stays_in_weekend_nights, 
##     family = "binomial", data = training_rec)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1546  -0.9307  -0.1015   1.0636   2.5552  
## 
## Coefficients:
##                                Estimate Std. Error z value             Pr(>|z|)
## (Intercept)                  -1.5668056  0.0632150 -24.785 < 0.0000000000000002
## lead_time                     0.0044062  0.0001088  40.484 < 0.0000000000000002
## customer_typeGroup           -0.4787355  0.1788416  -2.677              0.00743
## customer_typeTransient        0.7173600  0.0574954  12.477 < 0.0000000000000002
## customer_typeTransient-Party -0.0928939  0.0604956  -1.536              0.12465
## deposit_typeNon Refund        5.1504871  0.1555902  33.103 < 0.0000000000000002
## deposit_typeRefundable       -0.1702899  0.2548146  -0.668              0.50395
## adr                           0.0047002  0.0001973  23.824 < 0.0000000000000002
## hotelResort Hotel            -0.3595842  0.0207715 -17.311 < 0.0000000000000002
## total_of_special_requests    -0.5377867  0.0130916 -41.079 < 0.0000000000000002
## stays_in_week_nights          0.0430687  0.0058018   7.423    0.000000000000114
## stays_in_weekend_nights       0.0663862  0.0109091   6.085    0.000000001162208
##                                 
## (Intercept)                  ***
## lead_time                    ***
## customer_typeGroup           ** 
## customer_typeTransient       ***
## customer_typeTransient-Party    
## deposit_typeNon Refund       ***
## deposit_typeRefundable          
## adr                          ***
## hotelResort Hotel            ***
## total_of_special_requests    ***
## stays_in_week_nights         ***
## stays_in_weekend_nights      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 85823  on 61907  degrees of freedom
## Residual deviance: 64968  on 61896  degrees of freedom
## AIC: 64992
## 
## Number of Fisher Scoring iterations: 8

Prediksi

htl_test$prob_htl <- predict(model2, type = "response", newdata = htl_test)
ggplot(data = htl_test, aes(x = prob_htl))+
  geom_density(lwd = 0.5)+
  labs(title = "Distribution probability density")+
  theme_minimal()

pada graph di atas kita bisa lihat bahwa kepadatan peluang < 0.5 artinya Not canceled

htl_test <- htl_test %>% 
  mutate(is_canceled = factor(is_canceled, levels = c(0,1),
                              labels = c("Not canceled", "Canceled")))


htl_test$pred_htl <- factor(ifelse(htl_test$prob_htl> 0.5, "Canceled", "Not canceled"))
htl_test[1:10, c("pred_htl", "is_canceled")]
##        pred_htl  is_canceled
## 1      Canceled Not canceled
## 2  Not canceled Not canceled
## 3  Not canceled     Canceled
## 4  Not canceled Not canceled
## 5  Not canceled Not canceled
## 6  Not canceled Not canceled
## 7  Not canceled Not canceled
## 8  Not canceled     Canceled
## 9  Not canceled Not canceled
## 10 Not canceled Not canceled

Model Evaluation

confus_logit <- confusionMatrix(htl_test$pred_htl, htl_test$is_canceled, positive = "Canceled")
  • Accuracy: persentase observasi pada new data yang berhasil ditebak
  • Sensitivity/Recall: persentase observasi kelas positif yang benar ditebak
  • Specificity: persentase observasi kelas negatif yang benar ditebak (based on actual data
  • Pos Pred Value/Precision: persentase observasi kelas positif yang benar ditebak (based on prediction), sering disebut tingkat presisi model dalam menebak kelas positif
Accuracy = (5457+12428)/23878
Recall = 5457/(5457+3424) 
Precision = 5457/(5457+2569) 
Specificity = 12428/(12428+2569) 
recall <- round(5457/(5457+3424), 2)
Accuracy <- round((5457+12428)/23878, 2)
Precision <- round(5457/(5457+2569), 2)
Specificity <- round(12428/(12428+2569), 2)


performance <- cbind.data.frame(Accuracy,recall, Precision, Specificity )
performance
##   Accuracy recall Precision Specificity
## 1     0.75   0.61      0.68        0.83

Berdasarkan hasil ConfusionMatrix diatas, maka dapat kita ambil informasi bahwa kemampuan model dalam menebak target (Canceled, not canceled) 75%, sedangkan persentase kelas canceled berdasarkan data aktual yang berhasil di tebak adalah 61%, sedangakan persentase kelas canceled berdasarkan data prediction yang berhasil di tebak adalah 68%, dan persentase kelas Not canceled berdasarkan data aktual yang berhasil di tebak adalah 83%

K-Nearest Neighbour

Pre-Processing Data

htl_new <- read.csv("data_input/hotel_bookings.csv")
glimpse(htl_new)
## Rows: 119,390
## Columns: 32
## $ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Res...
## $ is_canceled                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 7...
## $ arrival_date_year              <int> 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ arrival_date_month             <chr> "July", "July", "July", "July", "Jul...
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, ...
## $ arrival_date_day_of_month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, ...
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", ...
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "...
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corpo...
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corpo...
## $ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "...
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "...
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deposit_type                   <chr> "No Deposit", "No Deposit", "No Depo...
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240"...
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NUL...
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ customer_type                  <chr> "Transient", "Transient", "Transient...
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out...
## $ reservation_status_date        <chr> "2015-07-01", "2015-07-01", "2015-07...

Menghapus variabel yang tidak di representatif untuk membuat model k-NN

htl_new <- htl %>% 
  dplyr::select(-c("hotel","arrival_date_month", "meal", "country", "market_segment", "distribution_channel", "deposit_type", "company", "customer_type", "reservation_status", "reservation_status_date", "customer_type", "assigned_room_type", "reserved_room_type", "agent"))

Cek missing value

colSums(is.na(htl_new))
##                    is_canceled                      lead_time 
##                              0                              0 
##              arrival_date_year       arrival_date_week_number 
##                              0                              0 
##      arrival_date_day_of_month        stays_in_weekend_nights 
##                              0                              0 
##           stays_in_week_nights                         adults 
##                              0                              0 
##                       children                         babies 
##                              0                              0 
##              is_repeated_guest         previous_cancellations 
##                              0                              0 
## previous_bookings_not_canceled                booking_changes 
##                              0                              0 
##           days_in_waiting_list                            adr 
##                              0                              0 
##    required_car_parking_spaces      total_of_special_requests 
##                              0                              0
htl_new <- na.omit(htl_new)
summary(htl_new)
##  is_canceled   lead_time   arrival_date_year arrival_date_week_number
##  0:75166     Min.   :  0   Min.   :2015      Min.   : 1.00           
##  1:44220     1st Qu.: 18   1st Qu.:2016      1st Qu.:16.00           
##              Median : 69   Median :2016      Median :28.00           
##              Mean   :104   Mean   :2016      Mean   :27.16           
##              3rd Qu.:160   3rd Qu.:2017      3rd Qu.:38.00           
##              Max.   :737   Max.   :2017      Max.   :53.00           
##  arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights
##  Min.   : 1.0              Min.   : 0.0000         Min.   : 0.0        
##  1st Qu.: 8.0              1st Qu.: 0.0000         1st Qu.: 1.0        
##  Median :16.0              Median : 1.0000         Median : 2.0        
##  Mean   :15.8              Mean   : 0.9276         Mean   : 2.5        
##  3rd Qu.:23.0              3rd Qu.: 2.0000         3rd Qu.: 3.0        
##  Max.   :31.0              Max.   :19.0000         Max.   :50.0        
##      adults          children           babies          is_repeated_guest
##  Min.   : 0.000   Min.   : 0.0000   Min.   : 0.000000   Min.   :0.00000  
##  1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.: 0.000000   1st Qu.:0.00000  
##  Median : 2.000   Median : 0.0000   Median : 0.000000   Median :0.00000  
##  Mean   : 1.856   Mean   : 0.1039   Mean   : 0.007949   Mean   :0.03191  
##  3rd Qu.: 2.000   3rd Qu.: 0.0000   3rd Qu.: 0.000000   3rd Qu.:0.00000  
##  Max.   :55.000   Max.   :10.0000   Max.   :10.000000   Max.   :1.00000  
##  previous_cancellations previous_bookings_not_canceled booking_changes  
##  Min.   : 0.00000       Min.   : 0.0000                Min.   : 0.0000  
##  1st Qu.: 0.00000       1st Qu.: 0.0000                1st Qu.: 0.0000  
##  Median : 0.00000       Median : 0.0000                Median : 0.0000  
##  Mean   : 0.08712       Mean   : 0.1371                Mean   : 0.2211  
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                3rd Qu.: 0.0000  
##  Max.   :26.00000       Max.   :72.0000                Max.   :21.0000  
##  days_in_waiting_list      adr          required_car_parking_spaces
##  Min.   :  0.000      Min.   :  -6.38   Min.   :0.00000            
##  1st Qu.:  0.000      1st Qu.:  69.29   1st Qu.:0.00000            
##  Median :  0.000      Median :  94.59   Median :0.00000            
##  Mean   :  2.321      Mean   : 101.83   Mean   :0.06252            
##  3rd Qu.:  0.000      3rd Qu.: 126.00   3rd Qu.:0.00000            
##  Max.   :391.000      Max.   :5400.00   Max.   :8.00000            
##  total_of_special_requests
##  Min.   :0.0000           
##  1st Qu.:0.0000           
##  Median :0.0000           
##  Mean   :0.5713           
##  3rd Qu.:1.0000           
##  Max.   :5.0000

berdasarkan hasil summary bahwa interval tidak memiliki nilai yang sama maka akan dilakukan scaling.

prop.table(table(htl_new$is_canceled))
## 
##         0         1 
## 0.6296048 0.3703952
RNGkind(sample.kind = "Rounding")
set.seed(321)
idx <- sample(nrow(htl_new), 0.8*nrow(htl_new))
htl_newtrain <- htl_new[idx, ]
htl_newtest <- htl_new[-idx, ]
prop.table(table(htl_new$is_canceled))
## 
##         0         1 
## 0.6296048 0.3703952

scaling

train_x <- htl_newtrain %>% 
  select_if(is.numeric)
  
# y data train
train_y <- htl_newtrain %>% 
  dplyr::select(is_canceled)
  
# x data test
test_x <- htl_newtest %>% 
  select_if(is.numeric)

# y data test
test_y <- htl_newtest %>% 
  dplyr::select(is_canceled)
train_x <- scale(train_x)


test_x <- scale(test_x, 
                center = attr(train_x, "scaled:center"),
                scale = attr(train_x, "scaled:scale"))

Menentukan nilai K

sqrt(nrow(htl_newtrain))
## [1] 309.0437

Predicting data menggunakan new data test denga algoritma K-NN

library(class)

pred_knn <- knn(train = train_x, test = test_x, cl = train_y$is_canceled, k = 309)

head(pred_knn)
## [1] 0 0 0 0 0 0
## Levels: 0 1

Model Evaluation

confus_knn <- confusionMatrix(as.factor(pred_knn), as.factor(test_y$is_canceled), "1")

Berdasarkan hasil ConfusionMatrix diatas, maka dapat kita ambil informasi bahwa kemampuan model dalam menebak target (Canceled, not canceled) 70.2% , sedangkan persentase kelas canceled berdasarkan data aktual yang berhasil di tebak adalah 38% sedangakan persentase kelas canceled berdasarkan data prediction yang berhasil di tebak adalah 89%%, dan persentase kelas Not canceled berdasarkan data aktual yang berhasil di tebak adalah 89%

Model evaluation Logistics regression dan K-NN

eval_knn <- data_frame(Accuracy = confus_knn$overall[1],
                       Recall = confus_knn$byClass[1],
                       Specificity = confus_knn$byClass[2],
                       Precision = confus_knn$byClass[2])
eval_knn 
## # A tibble: 1 x 4
##   Accuracy Recall Specificity Precision
##      <dbl>  <dbl>       <dbl>     <dbl>
## 1    0.755  0.559       0.869     0.869
eval_logit <- data_frame(Accuracy = confus_logit$overall[1],
                         Recall = confus_logit$byClass[1],
                         Specificity = confus_logit$byClass[2],
                         Precision = confus_logit$byClass[2]) 
eval_logit
## # A tibble: 1 x 4
##   Accuracy Recall Specificity Precision
##      <dbl>  <dbl>       <dbl>     <dbl>
## 1    0.749  0.614       0.829     0.829

Jika dilihat dari kedua model yang telah di buat yaitu menggunakan logistik regression dan K_NN, kemampuan model menebak benar dari data aktual menggunakan model logistik regression dengan nilai precision = 78.40% lebih besar dibandingkan model K-NN