R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.7      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Show the working directory so the reader can confirm where the CSV is
# resolved from.
getwd()
## [1] "/Users/harrywang/Desktop/projects/Hotel Booking Prediction"
# Read the bookings data relative to the project directory. The former
# setwd() call was removed: it hard-coded one user's home directory and
# contained "\ ", which R rejects as an unrecognized escape in a string.
# The getwd() output above shows the session already runs in that folder.
hotel_data <- read.csv("hotel_bookings.csv")
# Preview the first six bookings.
head(hotel_data)
##          hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel           0       342              2015               July
## 2 Resort Hotel           0       737              2015               July
## 3 Resort Hotel           0         7              2015               July
## 4 Resort Hotel           0        13              2015               July
## 5 Resort Hotel           0        14              2015               July
## 6 Resort Hotel           0        14              2015               July
##   arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1                       27                         1                       0
## 2                       27                         1                       0
## 3                       27                         1                       0
## 4                       27                         1                       0
## 5                       27                         1                       0
## 6                       27                         1                       0
##   stays_in_week_nights adults children babies meal country market_segment
## 1                    0      2        0      0   BB     PRT         Direct
## 2                    0      2        0      0   BB     PRT         Direct
## 3                    1      1        0      0   BB     GBR         Direct
## 4                    1      1        0      0   BB     GBR      Corporate
## 5                    2      2        0      0   BB     GBR      Online TA
## 6                    2      2        0      0   BB     GBR      Online TA
##   distribution_channel is_repeated_guest previous_cancellations
## 1               Direct                 0                      0
## 2               Direct                 0                      0
## 3               Direct                 0                      0
## 4            Corporate                 0                      0
## 5                TA/TO                 0                      0
## 6                TA/TO                 0                      0
##   previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1                              0                  C                  C
## 2                              0                  C                  C
## 3                              0                  A                  C
## 4                              0                  A                  A
## 5                              0                  A                  A
## 6                              0                  A                  A
##   booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1               3   No Deposit  NULL    NULL                    0     Transient
## 2               4   No Deposit  NULL    NULL                    0     Transient
## 3               0   No Deposit  NULL    NULL                    0     Transient
## 4               0   No Deposit   304    NULL                    0     Transient
## 5               0   No Deposit   240    NULL                    0     Transient
## 6               0   No Deposit   240    NULL                    0     Transient
##   adr required_car_parking_spaces total_of_special_requests reservation_status
## 1   0                           0                         0          Check-Out
## 2   0                           0                         0          Check-Out
## 3  75                           0                         0          Check-Out
## 4  75                           0                         0          Check-Out
## 5  98                           0                         1          Check-Out
## 6  98                           0                         1          Check-Out
##   reservation_status_date
## 1              2015-07-01
## 2              2015-07-01
## 3              2015-07-02
## 4              2015-07-02
## 5              2015-07-03
## 6              2015-07-03
# List the column names of the bookings data frame.
names(hotel_data)
##  [1] "hotel"                          "is_canceled"                   
##  [3] "lead_time"                      "arrival_date_year"             
##  [5] "arrival_date_month"             "arrival_date_week_number"      
##  [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
##  [9] "stays_in_week_nights"           "adults"                        
## [11] "children"                       "babies"                        
## [13] "meal"                           "country"                       
## [15] "market_segment"                 "distribution_channel"          
## [17] "is_repeated_guest"              "previous_cancellations"        
## [19] "previous_bookings_not_canceled" "reserved_room_type"            
## [21] "assigned_room_type"             "booking_changes"               
## [23] "deposit_type"                   "agent"                         
## [25] "company"                        "days_in_waiting_list"          
## [27] "customer_type"                  "adr"                           
## [29] "required_car_parking_spaces"    "total_of_special_requests"     
## [31] "reservation_status"             "reservation_status_date"
# Compact overview: one line per column with its type and first values.
hotel_data %>%
  glimpse()
## Rows: 119,390
## Columns: 32
## $ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year              <int> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type                  <chr> "Transient", "Transient", "Transient", …
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date        <chr> "2015-07-01", "2015-07-01", "2015-07-02…
# Base-R structure dump of the same data frame (types and leading values).
utils::str(hotel_data)
## 'data.frame':    119390 obs. of  32 variables:
##  $ hotel                         : chr  "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
##  $ is_canceled                   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : int  342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : chr  "July" "July" "July" "July" ...
##  $ arrival_date_week_number      : int  27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : int  0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : int  2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : chr  "BB" "BB" "BB" "BB" ...
##  $ country                       : chr  "PRT" "PRT" "GBR" "GBR" ...
##  $ market_segment                : chr  "Direct" "Direct" "Direct" "Corporate" ...
##  $ distribution_channel          : chr  "Direct" "Direct" "Direct" "Corporate" ...
##  $ is_repeated_guest             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : chr  "C" "C" "A" "A" ...
##  $ assigned_room_type            : chr  "C" "C" "C" "A" ...
##  $ booking_changes               : int  3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : chr  "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
##  $ agent                         : chr  "NULL" "NULL" "NULL" "304" ...
##  $ company                       : chr  "NULL" "NULL" "NULL" "NULL" ...
##  $ days_in_waiting_list          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : chr  "Transient" "Transient" "Transient" "Transient" ...
##  $ adr                           : num  0 0 75 75 98 ...
##  $ required_car_parking_spaces   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : int  0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : chr  "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
##  $ reservation_status_date       : chr  "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...

Data cleaning

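Check for missing values (NA). Note that the literal string "NULL" in the agent and company columns is not counted as missing here: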

# Total number of NA cells in the whole data frame.
sum(is.na(hotel_data))
## [1] 4
# Show every row that contains at least one NA value.
hotel_data[!complete.cases(hotel_data), ]
##            hotel is_canceled lead_time arrival_date_year arrival_date_month
## 40601 City Hotel           1         2              2015             August
## 40668 City Hotel           1         1              2015             August
## 40680 City Hotel           1         1              2015             August
## 41161 City Hotel           1         8              2015             August
##       arrival_date_week_number arrival_date_day_of_month
## 40601                       32                         3
## 40668                       32                         5
## 40680                       32                         5
## 41161                       33                        13
##       stays_in_weekend_nights stays_in_week_nights adults children babies meal
## 40601                       1                    0      2       NA      0   BB
## 40668                       0                    2      2       NA      0   BB
## 40680                       0                    2      3       NA      0   BB
## 41161                       2                    5      2       NA      0   BB
##       country market_segment distribution_channel is_repeated_guest
## 40601     PRT      Undefined            Undefined                 0
## 40668     PRT         Direct            Undefined                 0
## 40680     PRT      Undefined            Undefined                 0
## 41161     PRT      Online TA            Undefined                 0
##       previous_cancellations previous_bookings_not_canceled reserved_room_type
## 40601                      0                              0                  B
## 40668                      0                              0                  B
## 40680                      0                              0                  B
## 41161                      0                              0                  B
##       assigned_room_type booking_changes deposit_type agent company
## 40601                  B               0   No Deposit  NULL    NULL
## 40668                  B               0   No Deposit    14    NULL
## 40680                  B               0   No Deposit  NULL    NULL
## 41161                  B               0   No Deposit     9    NULL
##       days_in_waiting_list   customer_type  adr required_car_parking_spaces
## 40601                    0 Transient-Party 12.0                           0
## 40668                    0 Transient-Party 12.0                           0
## 40680                    0 Transient-Party 18.0                           0
## 41161                    0 Transient-Party 76.5                           0
##       total_of_special_requests reservation_status reservation_status_date
## 40601                         1           Canceled              2015-08-01
## 40668                         1           Canceled              2015-08-04
## 40680                         2           Canceled              2015-08-04
## 41161                         1           Canceled              2015-08-09
# Per-column summary statistics (quartiles for numeric columns, length and
# class for character columns).
hotel_data %>%
  summary()
##     hotel            is_canceled       lead_time   arrival_date_year
##  Length:119390      Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Class :character   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##  Mode  :character   Median :0.0000   Median : 69   Median :2016     
##                     Mean   :0.3704   Mean   :104   Mean   :2016     
##                     3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                     Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                     
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  Length:119390      Min.   : 1.00            Min.   : 1.0             
##  Class :character   1st Qu.:16.00            1st Qu.: 8.0             
##  Mode  :character   Median :28.00            Median :16.0             
##                     Mean   :27.17            Mean   :15.8             
##                     3rd Qu.:38.00            3rd Qu.:23.0             
##                     Max.   :53.00            Max.   :31.0             
##                                                                       
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies              meal             country         
##  Min.   : 0.0000   Min.   : 0.000000   Length:119390      Length:119390     
##  1st Qu.: 0.0000   1st Qu.: 0.000000   Class :character   Class :character  
##  Median : 0.0000   Median : 0.000000   Mode  :character   Mode  :character  
##  Mean   : 0.1039   Mean   : 0.007949                                        
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000                                        
##  Max.   :10.0000   Max.   :10.000000                                        
##  NA's   :4                                                                  
##  market_segment     distribution_channel is_repeated_guest
##  Length:119390      Length:119390        Min.   :0.00000  
##  Class :character   Class :character     1st Qu.:0.00000  
##  Mode  :character   Mode  :character     Median :0.00000  
##                                          Mean   :0.03191  
##                                          3rd Qu.:0.00000  
##                                          Max.   :1.00000  
##                                                           
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                Length:119390     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                Class :character  
##  Median : 0.00000       Median : 0.0000                Mode  :character  
##  Mean   : 0.08712       Mean   : 0.1371                                  
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                                  
##  Max.   :26.00000       Max.   :72.0000                                  
##                                                                          
##  assigned_room_type booking_changes   deposit_type          agent          
##  Length:119390      Min.   : 0.0000   Length:119390      Length:119390     
##  Class :character   1st Qu.: 0.0000   Class :character   Class :character  
##  Mode  :character   Median : 0.0000   Mode  :character   Mode  :character  
##                     Mean   : 0.2211                                        
##                     3rd Qu.: 0.0000                                        
##                     Max.   :21.0000                                        
##                                                                            
##    company          days_in_waiting_list customer_type           adr         
##  Length:119390      Min.   :  0.000      Length:119390      Min.   :  -6.38  
##  Class :character   1st Qu.:  0.000      Class :character   1st Qu.:  69.29  
##  Mode  :character   Median :  0.000      Mode  :character   Median :  94.58  
##                     Mean   :  2.321                         Mean   : 101.83  
##                     3rd Qu.:  0.000                         3rd Qu.: 126.00  
##                     Max.   :391.000                         Max.   :5400.00  
##                                                                              
##  required_car_parking_spaces total_of_special_requests reservation_status
##  Min.   :0.00000             Min.   :0.0000            Length:119390     
##  1st Qu.:0.00000             1st Qu.:0.0000            Class :character  
##  Median :0.00000             Median :0.0000            Mode  :character  
##  Mean   :0.06252             Mean   :0.5714                              
##  3rd Qu.:0.00000             3rd Qu.:1.0000                              
##  Max.   :8.00000             Max.   :5.0000                              
##                                                                          
##  reservation_status_date
##  Length:119390          
##  Class :character       
##  Mode  :character       
##                         
##                         
##                         
## 
# Number of rows (bookings) in the data set.
dim(hotel_data)[1L]
## [1] 119390
# Number of columns (variables) in the data set.
dim(hotel_data)[2L]
## [1] 32

Exploring the number of countries involved

# Number of bookings per country, most frequent country first.
arrange(
  summarise(group_by(hotel_data, country), num = n()),
  desc(num)
)
## # A tibble: 178 × 2
##    country   num
##    <chr>   <int>
##  1 PRT     48590
##  2 GBR     12129
##  3 FRA     10415
##  4 ESP      8568
##  5 DEU      7287
##  6 ITA      3766
##  7 IRL      3375
##  8 BEL      2342
##  9 BRA      2224
## 10 NLD      2104
## # … with 168 more rows
# Store every categorical column as a factor. This covers the text columns as
# well as the 0/1 flags and the arrival day/month/year, which are treated as
# categories rather than numbers from here on.
hotel_data <- hotel_data %>%
  mutate(across(
    all_of(c(
      "hotel", "is_canceled", "meal", "country", "market_segment",
      "distribution_channel", "is_repeated_guest", "reserved_room_type",
      "assigned_room_type", "deposit_type", "customer_type",
      "reservation_status", "agent", "company",
      "arrival_date_day_of_month", "arrival_date_month", "arrival_date_year"
    )),
    as.factor
  ))

# Re-inspect the column types after the factor conversion.
hotel_data %>%
  glimpse()
## Rows: 119,390
## Columns: 32
## $ hotel                          <fct> Resort Hotel, Resort Hotel, Resort Hote…
## $ is_canceled                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time                      <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year              <fct> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month             <fct> July, July, July, July, July, July, Jul…
## $ arrival_date_week_number       <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month      <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights           <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults                         <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal                           <fct> BB, BB, BB, BB, BB, BB, BB, FB, BB, HB,…
## $ country                        <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT,…
## $ market_segment                 <fct> Direct, Direct, Direct, Corporate, Onli…
## $ distribution_channel           <fct> Direct, Direct, Direct, Corporate, TA/T…
## $ is_repeated_guest              <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type             <fct> C, C, A, A, A, A, C, C, A, D, E, D, D, …
## $ assigned_room_type             <fct> C, C, C, A, A, A, C, C, A, D, E, D, E, …
## $ booking_changes                <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type                   <fct> No Deposit, No Deposit, No Deposit, No …
## $ agent                          <fct> NULL, NULL, NULL, 304, 240, 240, NULL, …
## $ company                        <fct> NULL, NULL, NULL, NULL, NULL, NULL, NUL…
## $ days_in_waiting_list           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type                  <fct> Transient, Transient, Transient, Transi…
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status             <fct> Check-Out, Check-Out, Check-Out, Check-…
## $ reservation_status_date        <chr> "2015-07-01", "2015-07-01", "2015-07-02…

Checking for outliers

# List the bookings whose average daily rate (adr) exceeds 1000.
filter(hotel_data, adr > 1000)
##        hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 City Hotel           1        35              2016              March
##   arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1                       13                        25                       0
##   stays_in_week_nights adults children babies meal country market_segment
## 1                    1      2        0      0   BB     PRT  Offline TA/TO
##   distribution_channel is_repeated_guest previous_cancellations
## 1                TA/TO                 0                      0
##   previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1                              0                  A                  A
##   booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1               1   Non Refund    12    NULL                    0     Transient
##    adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 5400                           0                         0           Canceled
##   reservation_status_date
## 1              2016-02-19
# Only one booking has adr above 1000 (a single entry of 5400), so replace it
# with the mean adr. The mean is taken over the remaining bookings only;
# averaging over the full column would let the 5400 value inflate its own
# replacement.
hotel_data <- hotel_data %>%
  mutate(adr = replace(adr, adr > 1000, mean(adr[adr <= 1000])))

# Add two derived columns:
#   stay_nights_total - weekend nights plus week nights
#   stay_cost_total   - adr multiplied by the total number of nights
hotel_data$stay_nights_total <-
  hotel_data$stays_in_weekend_nights + hotel_data$stays_in_week_nights
hotel_data$stay_cost_total <- hotel_data$adr * hotel_data$stay_nights_total

# Distribution of the total nights stayed.
summary(hotel_data[["stay_nights_total"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   3.428   4.000  69.000
# Distribution of the total stay cost.
summary(hotel_data[["stay_cost_total"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -63.8   146.0   267.0   357.8   446.2  7590.0

EDA Plot

# Scatter plot of total nights against total cost; point shape encodes the
# hotel type and colour encodes whether the booking was cancelled.
hotel_data %>%
  ggplot(aes(
    x = stay_nights_total,
    y = stay_cost_total,
    shape = hotel,
    color = is_canceled
  )) +
  geom_point(alpha = 1)

# The same scatter plot, drawn as one panel per market segment.
hotel_data %>%
  ggplot(aes(
    x = stay_nights_total,
    y = stay_cost_total,
    shape = hotel,
    color = is_canceled
  )) +
  geom_point() +
  facet_wrap(~market_segment)

# Arrival year versus cancellation: 2016 is the year with the most bookings
# as well as the most cancellations.
ggplot(hotel_data, aes(x = arrival_date_year, fill = is_canceled)) +
  geom_bar()

# Bookings per hotel type, split by cancellation.
ggplot(hotel_data, aes(x = hotel, fill = is_canceled)) +
  geom_bar()

# Assigned room type (room types A and D are booked the most and are also
# cancelled the most).
ggplot(hotel_data, aes(x = assigned_room_type, fill = is_canceled)) +
  geom_bar()

# Distribution channel (bookings made through TA/TO are the most numerous and
# the most cancelled, followed by direct bookings).
ggplot(hotel_data, aes(x = distribution_channel, fill = is_canceled)) +
  geom_bar()

# Histogram of days on the waiting list (bookings with more than one day on
# the list only), split by cancellation.
ggplot(
  filter(hotel_data, days_in_waiting_list > 1),
  aes(x = days_in_waiting_list, fill = is_canceled)
) +
  geom_histogram(binwidth = 10)

# Deposit type: bookings that need no deposit have the higher check-in rate.
ggplot(hotel_data, aes(x = deposit_type, fill = is_canceled)) +
  geom_bar()

# Lead time and cancellations. Lead time is the time from the moment the
# booking is entered into the property management system (PMS) until arrival.
ggplot(hotel_data, aes(x = lead_time, fill = is_canceled)) +
  geom_histogram(binwidth = 10, position = "dodge")

## Modeling

# Data Modeling
# Remove the bookings with an 'Undefined' market segment BEFORE drawing the
# split. Sampling indices from the unfiltered row count lets an index point
# past the end of the filtered data frame, which puts an all-NA row in `test`
# and leaves the corresponding row in `training`.
hotel_data <- hotel_data %>%
  filter(market_segment != "Undefined")

set.seed(1)  # fix the random seed so the split is reproducible
# Randomly pick 30% of the row positions; floor() makes the sample size an
# explicit whole number.
index <- sample(nrow(hotel_data), floor(nrow(hotel_data) * 0.3))

test <- hotel_data[index, ]       # 30% of the rows form the test set
training <- hotel_data[-index, ]  # the remaining rows form the training set


# Based on the initial analysis, start modelling with a subset of 20 columns
# (the is_canceled target plus 19 predictors) because the data set is large.
training_1 <- training[, c(
  "hotel", "is_canceled", "lead_time", "adults", "children", "babies", "meal",
  "market_segment", "distribution_channel", "is_repeated_guest",
  "previous_cancellations", "previous_bookings_not_canceled",
  "reserved_room_type", "deposit_type", "days_in_waiting_list",
  "customer_type", "adr", "required_car_parking_spaces",
  "stay_nights_total", "stay_cost_total"
), drop = FALSE]

# Fit a logistic regression of is_canceled on every other column of
# training_1.
# NOTE(review): the warning echoed below is glm's "fitted probabilities
# numerically 0 or 1 occurred" message. Together with the very large estimates
# and standard errors reported for distribution_channelUndefined and
# required_car_parking_spaces, it suggests (quasi-)separation; confirm before
# interpreting those two coefficients.
logit_training_model<-glm(is_canceled~.,family="binomial",data=training_1)
## Warning: glm.fit:擬合機率算出來是數值零或一
# Print the coefficient table, deviances and AIC of the fitted model.
summary(logit_training_model)
## 
## Call:
## glm(formula = is_canceled ~ ., family = "binomial", data = training_1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4904  -0.8186  -0.4917   0.1866   5.4275  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -2.562e+00  2.081e-01 -12.314  < 2e-16 ***
## hotelResort Hotel               1.303e-02  2.312e-02   0.563 0.573135    
## lead_time                       3.254e-03  1.061e-04  30.655  < 2e-16 ***
## adults                          9.344e-02  1.732e-02   5.394 6.88e-08 ***
## children                        1.183e-01  2.774e-02   4.263 2.01e-05 ***
## babies                         -5.668e-01  1.101e-01  -5.148 2.63e-07 ***
## mealFB                          8.420e-01  1.240e-01   6.793 1.10e-11 ***
## mealHB                         -1.737e-01  3.143e-02  -5.529 3.22e-08 ***
## mealSC                          1.786e-01  2.981e-02   5.990 2.10e-09 ***
## mealUndefined                  -4.342e-01  1.163e-01  -3.734 0.000189 ***
## market_segmentComplementary    -2.278e-01  2.662e-01  -0.856 0.391976    
## market_segmentCorporate        -3.237e-01  2.029e-01  -1.595 0.110750    
## market_segmentDirect           -3.190e-01  2.254e-01  -1.415 0.157022    
## market_segmentGroups           -9.763e-02  2.121e-01  -0.460 0.645291    
## market_segmentOffline TA/TO    -7.649e-01  2.130e-01  -3.592 0.000329 ***
## market_segmentOnline TA         1.650e-01  2.120e-01   0.778 0.436408    
## distribution_channelDirect     -4.532e-01  1.105e-01  -4.101 4.11e-05 ***
## distribution_channelGDS        -1.299e+00  3.063e-01  -4.242 2.22e-05 ***
## distribution_channelTA/TO       2.169e-02  8.230e-02   0.264 0.792105    
## distribution_channelUndefined   8.842e+03  9.160e+05   0.010 0.992298    
## is_repeated_guest1             -9.290e-01  9.863e-02  -9.419  < 2e-16 ***
## previous_cancellations          3.045e+00  7.371e-02  41.316  < 2e-16 ***
## previous_bookings_not_canceled -4.847e-01  2.920e-02 -16.601  < 2e-16 ***
## reserved_room_typeB             2.107e-02  8.658e-02   0.243 0.807736    
## reserved_room_typeC             2.657e-01  1.039e-01   2.556 0.010578 *  
## reserved_room_typeD            -4.106e-02  2.531e-02  -1.622 0.104749    
## reserved_room_typeE             6.310e-02  4.097e-02   1.540 0.123519    
## reserved_room_typeF            -1.983e-01  6.452e-02  -3.073 0.002119 ** 
## reserved_room_typeG             6.456e-02  7.704e-02   0.838 0.402025    
## reserved_room_typeH             4.207e-01  1.306e-01   3.221 0.001277 ** 
## reserved_room_typeL             9.301e-01  9.685e-01   0.960 0.336896    
## reserved_room_typeP             1.606e+01  3.104e+02   0.052 0.958741    
## deposit_typeNon Refund          5.791e+00  1.362e-01  42.520  < 2e-16 ***
## deposit_typeRefundable          3.534e-01  2.347e-01   1.506 0.132141    
## days_in_waiting_list            7.084e-04  5.769e-04   1.228 0.219499    
## customer_typeGroup             -1.244e-01  2.054e-01  -0.606 0.544611    
## customer_typeTransient          1.018e+00  6.291e-02  16.178  < 2e-16 ***
## customer_typeTransient-Party    5.008e-01  6.710e-02   7.463 8.44e-14 ***
## adr                             2.537e-03  3.410e-04   7.439 1.01e-13 ***
## required_car_parking_spaces    -8.855e+03  9.160e+05  -0.010 0.992286    
## stay_nights_total               1.533e-02  6.919e-03   2.216 0.026689 *  
## stay_cost_total                 2.624e-04  6.310e-05   4.159 3.20e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 110170  on 83571  degrees of freedom
## Residual deviance:  75236  on 83530  degrees of freedom
## AIC: 75320
## 
## Number of Fisher Scoring iterations: 13
# Score the test set with the logistic regression: predicted probability of
# cancellation for each booking.
test[["logit_pred_prob"]] <- predict(
  logit_training_model,
  newdata = test,
  type = "response"
)
# Label a booking as cancelled ("1") when its probability exceeds 0.5.
test[["logit_pred_class"]] <- ifelse(test[["logit_pred_prob"]] > 0.5, "1", "0")

# Count how many predicted labels miss / match the actual outcome.
table(test[["is_canceled"]] == test[["logit_pred_class"]])
## 
## FALSE  TRUE 
##  7875 27939
# Confusion matrix: predicted class in rows, actual outcome in columns.
table(test$logit_pred_class, test$is_canceled, dnn = c("predicted", "actual"))
##          actual
## predicted     0     1
##         0 21650  6982
##         1   893  6289
# Accuracy = share of correct predictions. It is computed from the data
# instead of a typed-in count, so it stays correct when the split or the model
# changes; rows without a prediction (NA) are left out of the denominator.
mean(test$is_canceled == test$logit_pred_class, na.rm = TRUE) #Logistic Regression shows an accuracy of 78%
## [1] 0.7800486
#Random Forest Modeling
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## 載入套件:'randomForest'
## 下列物件被遮斷自 'package:dplyr':
## 
##     combine
## 下列物件被遮斷自 'package:ggplot2':
## 
##     margin
# Baseline random forest classifier for is_canceled, using every other column
# of training_1 as a predictor.
set.seed(1)  # fix the RNG seed before the forest is grown
rf_training_model<-randomForest(is_canceled~.,    # model formula
                                data=training_1,          # use a training dataset for building a model
                                ntree=500,                     # grow 500 trees
                                cutoff=c(0.5,0.5), # one cutoff value per class (two classes)
                                mtry=2, # number of variables tried at each split
                                importance=TRUE) # also compute variable importance
# Print the fitted forest: OOB error estimate and confusion matrix.
rf_training_model
## 
## Call:
##  randomForest(formula = is_canceled ~ ., data = training_1, ntree = 500,      cutoff = c(0.5, 0.5), mtry = 2, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 21.79%
## Confusion matrix:
##       0     1 class.error
## 0 51977   646   0.0122760
## 1 17565 13384   0.5675466
# Tune mtry (variables tried at each split): start at 2 and compare the
# out-of-bag error of 500-tree forests for neighbouring mtry values.
set.seed(1)
res <- tuneRF(
  x = select(training_1, -is_canceled),
  y = training_1[["is_canceled"]],
  mtryStart = 2,
  ntreeTry = 500
)
## mtry = 2  OOB error = 21.71% 
## Searching left ...
## mtry = 1     OOB error = 25.13% 
## -0.1574625 0.05 
## Searching right ...
## mtry = 4     OOB error = 18.42% 
## 0.1513448 0.05 
## mtry = 8     OOB error = 17.36% 
## 0.05779971 0.05 
## mtry = 16    OOB error = 17.65% 
## -0.01695616 0.05

# Refit the random forest with mtry = 8, the value with the lowest OOB error
# in the tuning log printed by tuneRF (17.36%).
# NOTE(review): there is no set.seed() directly before this fit, so the result
# depends on the RNG state left by the preceding tuneRF call; confirm whether
# this chunk is meant to be reproducible when run on its own.
rf_best_model<-randomForest(is_canceled~.,              # model formula
                            data=training_1,          # use a training dataset for building a model
                            ntree=500,                     # grow 500 trees
                            cutoff=c(0.5,0.5), # one cutoff value per class (two classes)
                            mtry=8, # number of variables tried at each split
                            importance=TRUE) # also compute variable importance
# Print the fitted forest: OOB error estimate and confusion matrix.
rf_best_model
## 
## Call:
##  randomForest(formula = is_canceled ~ ., data = training_1, ntree = 500,      cutoff = c(0.5, 0.5), mtry = 8, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 17.34%
## Confusion matrix:
##       0     1 class.error
## 0 46760  5863   0.1114152
## 1  8632 22317   0.2789105
# Score the test set with the tuned forest: column 2 of the probability
# matrix is the probability of class "1" (cancelled).
test$rf_pred_prob <- predict(rf_best_model, test, type = "prob")[, 2]   #use a test dataset for model evaluation
# Predicted class label for each test booking.
test$rf_pred_class <- predict(rf_best_model, test, type = "class")

# Count how many predicted labels miss / match the actual outcome.
table(test$is_canceled == test$rf_pred_class)
## 
## FALSE  TRUE 
##  6365 29449
# Accuracy = share of correct predictions. It is computed from the data: the
# previously typed-in numerator (29451) did not match the table above (29449).
# Rows without a prediction (NA) are left out of the denominator.
mean(test$is_canceled == test$rf_pred_class, na.rm = TRUE)
## [1] 0.8222632

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.