This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Print the current working directory of the R session
getwd()
## [1] "/Users/harrywang/Desktop/projects/Hotel Booking Prediction"
# Point the session at the project folder and load the bookings table.
# The folder name contains spaces; inside an R string literal they are written
# as-is ("\ " is not a recognised escape sequence and is a parse error).
setwd("/Users/harrywang/Desktop/projects/Hotel Booking Prediction")
hotel_data <- read.csv("hotel_bookings.csv")
# Preview the first six rows
head(hotel_data)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 27 1 0
## 2 27 1 0
## 3 27 1 0
## 4 27 1 0
## 5 27 1 0
## 6 27 1 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 0 2 0 0 BB PRT Direct
## 2 0 2 0 0 BB PRT Direct
## 3 1 1 0 0 BB GBR Direct
## 4 1 1 0 0 BB GBR Corporate
## 5 2 2 0 0 BB GBR Online TA
## 6 2 2 0 0 BB GBR Online TA
## distribution_channel is_repeated_guest previous_cancellations
## 1 Direct 0 0
## 2 Direct 0 0
## 3 Direct 0 0
## 4 Corporate 0 0
## 5 TA/TO 0 0
## 6 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 C C
## 2 0 C C
## 3 0 A C
## 4 0 A A
## 5 0 A A
## 6 0 A A
## booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1 3 No Deposit NULL NULL 0 Transient
## 2 4 No Deposit NULL NULL 0 Transient
## 3 0 No Deposit NULL NULL 0 Transient
## 4 0 No Deposit 304 NULL 0 Transient
## 5 0 No Deposit 240 NULL 0 Transient
## 6 0 No Deposit 240 NULL 0 Transient
## adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 0 0 0 Check-Out
## 2 0 0 0 Check-Out
## 3 75 0 0 Check-Out
## 4 75 0 0 Check-Out
## 5 98 0 1 Check-Out
## 6 98 0 1 Check-Out
## reservation_status_date
## 1 2015-07-01
## 2 2015-07-01
## 3 2015-07-02
## 4 2015-07-02
## 5 2015-07-03
## 6 2015-07-03
# List the column names of the bookings table
colnames(hotel_data)
## [1] "hotel" "is_canceled"
## [3] "lead_time" "arrival_date_year"
## [5] "arrival_date_month" "arrival_date_week_number"
## [7] "arrival_date_day_of_month" "stays_in_weekend_nights"
## [9] "stays_in_week_nights" "adults"
## [11] "children" "babies"
## [13] "meal" "country"
## [15] "market_segment" "distribution_channel"
## [17] "is_repeated_guest" "previous_cancellations"
## [19] "previous_bookings_not_canceled" "reserved_room_type"
## [21] "assigned_room_type" "booking_changes"
## [23] "deposit_type" "agent"
## [25] "company" "days_in_waiting_list"
## [27] "customer_type" "adr"
## [29] "required_car_parking_spaces" "total_of_special_requests"
## [31] "reservation_status" "reservation_status_date"
# Compact overview: one line per column with its type and first values
glimpse(hotel_data)
## Rows: 119,390
## Columns: 32
## $ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year <int> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type <chr> "Transient", "Transient", "Transient", …
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date <chr> "2015-07-01", "2015-07-01", "2015-07-02…
# Base-R structure dump of the same table (column types and sample values)
str(hotel_data)
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
Check for missing values (NA):
# Count NA cells across the whole data frame. The literal "NULL" strings in
# the agent/company columns are ordinary character values, so is.na() does
# not count them.
sum(is.na(hotel_data))
## [1] 4
# Show every row that has at least one NA in any column
hotel_data[!complete.cases(hotel_data), ]
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 40601 City Hotel 1 2 2015 August
## 40668 City Hotel 1 1 2015 August
## 40680 City Hotel 1 1 2015 August
## 41161 City Hotel 1 8 2015 August
## arrival_date_week_number arrival_date_day_of_month
## 40601 32 3
## 40668 32 5
## 40680 32 5
## 41161 33 13
## stays_in_weekend_nights stays_in_week_nights adults children babies meal
## 40601 1 0 2 NA 0 BB
## 40668 0 2 2 NA 0 BB
## 40680 0 2 3 NA 0 BB
## 41161 2 5 2 NA 0 BB
## country market_segment distribution_channel is_repeated_guest
## 40601 PRT Undefined Undefined 0
## 40668 PRT Direct Undefined 0
## 40680 PRT Undefined Undefined 0
## 41161 PRT Online TA Undefined 0
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## 40601 0 0 B
## 40668 0 0 B
## 40680 0 0 B
## 41161 0 0 B
## assigned_room_type booking_changes deposit_type agent company
## 40601 B 0 No Deposit NULL NULL
## 40668 B 0 No Deposit 14 NULL
## 40680 B 0 No Deposit NULL NULL
## 41161 B 0 No Deposit 9 NULL
## days_in_waiting_list customer_type adr required_car_parking_spaces
## 40601 0 Transient-Party 12.0 0
## 40668 0 Transient-Party 12.0 0
## 40680 0 Transient-Party 18.0 0
## 41161 0 Transient-Party 76.5 0
## total_of_special_requests reservation_status reservation_status_date
## 40601 1 Canceled 2015-08-01
## 40668 1 Canceled 2015-08-04
## 40680 2 Canceled 2015-08-04
## 41161 1 Canceled 2015-08-09
# Per-column summary: quantiles and NA counts for numeric columns,
# length/class/mode for character columns
summary(hotel_data)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Length:119390
## Class :character
## Mode :character
##
##
##
##
# Number of bookings (rows)
nrow(hotel_data)
## [1] 119390
# Number of variables (columns)
ncol(hotel_data)
## [1] 32
# Bookings per country of origin, most frequent first.
# as_tibble() keeps the result a tibble, as the grouped summary was.
as_tibble(hotel_data) %>%
  count(country, name = "num", sort = TRUE)
## # A tibble: 178 × 2
## country num
## <chr> <int>
## 1 PRT 48590
## 2 GBR 12129
## 3 FRA 10415
## 4 ESP 8568
## 5 DEU 7287
## 6 ITA 3766
## 7 IRL 3375
## 8 BEL 2342
## 9 BRA 2224
## 10 NLD 2104
## # … with 168 more rows
# Recode the categorical columns as factors in one pass. This covers the
# text columns, the 0/1 flags and the calendar fields (year, month, day of
# month); every other column keeps its type.
hotel_data <- hotel_data %>%
  mutate(across(
    c(
      hotel, is_canceled, meal, country, market_segment,
      distribution_channel, is_repeated_guest, reserved_room_type,
      assigned_room_type, deposit_type, customer_type, reservation_status,
      agent, company, arrival_date_day_of_month, arrival_date_month,
      arrival_date_year
    ),
    as.factor
  ))
# Confirm the new column types
glimpse(hotel_data)
## Rows: 119,390
## Columns: 32
## $ hotel <fct> Resort Hotel, Resort Hotel, Resort Hote…
## $ is_canceled <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month <fct> July, July, July, July, July, July, Jul…
## $ arrival_date_week_number <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal <fct> BB, BB, BB, BB, BB, BB, BB, FB, BB, HB,…
## $ country <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT,…
## $ market_segment <fct> Direct, Direct, Direct, Corporate, Onli…
## $ distribution_channel <fct> Direct, Direct, Direct, Corporate, TA/T…
## $ is_repeated_guest <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type <fct> C, C, A, A, A, A, C, C, A, D, E, D, D, …
## $ assigned_room_type <fct> C, C, C, A, A, A, C, C, A, D, E, D, E, …
## $ booking_changes <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type <fct> No Deposit, No Deposit, No Deposit, No …
## $ agent <fct> NULL, NULL, NULL, 304, 240, 240, NULL, …
## $ company <fct> NULL, NULL, NULL, NULL, NULL, NULL, NUL…
## $ days_in_waiting_list <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type <fct> Transient, Transient, Transient, Transi…
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status <fct> Check-Out, Check-Out, Check-Out, Check-…
## $ reservation_status_date <chr> "2015-07-01", "2015-07-01", "2015-07-02…
# List the bookings whose average daily rate (adr) exceeds 1000
filter(hotel_data, adr > 1000)
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1 City Hotel 1 35 2016 March
## arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1 13 25 0
## stays_in_week_nights adults children babies meal country market_segment
## 1 1 2 0 0 BB PRT Offline TA/TO
## distribution_channel is_repeated_guest previous_cancellations
## 1 TA/TO 0 0
## previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1 0 A A
## booking_changes deposit_type agent company days_in_waiting_list customer_type
## 1 1 Non Refund 12 NULL 0 Transient
## adr required_car_parking_spaces total_of_special_requests reservation_status
## 1 5400 0 0 Canceled
## reservation_status_date
## 1 2016-02-19
# Only one booking has adr above 1000 (its value is 5400), so treat it as an
# outlier and overwrite it with the mean adr of the remaining bookings.
# The mean is taken over adr <= 1000 so the outlier does not inflate its own
# replacement value.
hotel_data <- hotel_data %>%
  mutate(adr = replace(adr, adr > 1000, mean(adr[adr <= 1000])))

# Derive two features: total nights stayed and total cost of the stay
hotel_data <- hotel_data %>%
  mutate(
    stay_nights_total = stays_in_weekend_nights + stays_in_week_nights,
    stay_cost_total = adr * stay_nights_total
  )
summary(hotel_data$stay_nights_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 3.428 4.000 69.000
# Distribution of the derived total stay cost (adr * total nights)
summary(hotel_data$stay_cost_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -63.8 146.0 267.0 357.8 446.2 7590.0
# Scatter plot of total nights against total cost; point shape encodes the
# hotel and colour encodes the cancellation flag
hotel_data %>%
  ggplot(aes(
    x = stay_nights_total, y = stay_cost_total,
    shape = hotel, color = is_canceled
  )) +
  geom_point(alpha = 1)

# Same scatter plot, one panel per market segment
hotel_data %>%
  ggplot(aes(
    x = stay_nights_total, y = stay_cost_total,
    shape = hotel, color = is_canceled
  )) +
  geom_point() +
  facet_wrap(~market_segment)

# Arrival year versus cancellation:
# year 2016 is the one with the most bookings as well as cancellations
ggplot(hotel_data, aes(x = arrival_date_year, fill = is_canceled)) +
  geom_bar()

# Bar chart of hotel types
ggplot(hotel_data, aes(x = hotel, fill = is_canceled)) +
  geom_bar()

# Assigned room type (room types A and D are booked the most and are also
# cancelled the most)
ggplot(hotel_data, aes(x = assigned_room_type, fill = is_canceled)) +
  geom_bar()

# Distribution channel (bookings made through TA/TO are the most numerous and
# the most cancelled, followed by direct bookings)
ggplot(hotel_data, aes(x = distribution_channel, fill = is_canceled)) +
  geom_bar()

# Histogram of days on the waiting list, split by cancellation;
# only bookings that waited more than one day are included
ggplot(
  filter(hotel_data, days_in_waiting_list > 1),
  aes(x = days_in_waiting_list, fill = is_canceled)
) +
  geom_histogram(binwidth = 10)

# Deposit type (bookings that require no deposit have a higher check-in rate)
ggplot(hotel_data, aes(x = deposit_type, fill = is_canceled)) +
  geom_bar()

# Lead time and hotel cancellations.
# Lead time is the time from the moment the booking is entered into the
# property management system (PMS) until the arrival date.
ggplot(hotel_data, aes(x = lead_time, fill = is_canceled)) +
  geom_histogram(binwidth = 10, position = "dodge")
## Modeling
# Data Modeling
# Remove the 'Undefined' market segment BEFORE drawing the split indices.
# Sampling row numbers from the unfiltered table and then indexing the
# filtered (shorter) table shifts rows and can select positions past the
# last row, which come back as all-NA rows in the test set.
hotel_data <- hotel_data %>%
  filter(market_segment != "Undefined")

set.seed(1) # fixed seed so the split is reproducible
index <- sample(nrow(hotel_data), floor(nrow(hotel_data) * 0.3))
test <- hotel_data[index, ]      # 30% held out as the test set
training <- hotel_data[-index, ] # remaining 70% used for training

# Based on the initial analysis, start modeling with 20 columns (the
# is_canceled response plus 19 predictors) because the dataset is large
training_1 <- training[c(
  "hotel", "is_canceled", "lead_time", "adults", "children", "babies", "meal",
  "market_segment", "distribution_channel", "is_repeated_guest",
  "previous_cancellations", "previous_bookings_not_canceled",
  "reserved_room_type", "deposit_type", "days_in_waiting_list",
  "customer_type", "adr", "required_car_parking_spaces",
  "stay_nights_total", "stay_cost_total"
)]

# Logistic regression of is_canceled on every other column of training_1
logit_training_model <- glm(is_canceled ~ ., family = "binomial", data = training_1)
## Warning: glm.fit:擬合機率算出來是數值零或一
# Coefficient table and fit statistics of the logistic regression
summary(logit_training_model)
##
## Call:
## glm(formula = is_canceled ~ ., family = "binomial", data = training_1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 -0.8186 -0.4917 0.1866 5.4275
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.562e+00 2.081e-01 -12.314 < 2e-16 ***
## hotelResort Hotel 1.303e-02 2.312e-02 0.563 0.573135
## lead_time 3.254e-03 1.061e-04 30.655 < 2e-16 ***
## adults 9.344e-02 1.732e-02 5.394 6.88e-08 ***
## children 1.183e-01 2.774e-02 4.263 2.01e-05 ***
## babies -5.668e-01 1.101e-01 -5.148 2.63e-07 ***
## mealFB 8.420e-01 1.240e-01 6.793 1.10e-11 ***
## mealHB -1.737e-01 3.143e-02 -5.529 3.22e-08 ***
## mealSC 1.786e-01 2.981e-02 5.990 2.10e-09 ***
## mealUndefined -4.342e-01 1.163e-01 -3.734 0.000189 ***
## market_segmentComplementary -2.278e-01 2.662e-01 -0.856 0.391976
## market_segmentCorporate -3.237e-01 2.029e-01 -1.595 0.110750
## market_segmentDirect -3.190e-01 2.254e-01 -1.415 0.157022
## market_segmentGroups -9.763e-02 2.121e-01 -0.460 0.645291
## market_segmentOffline TA/TO -7.649e-01 2.130e-01 -3.592 0.000329 ***
## market_segmentOnline TA 1.650e-01 2.120e-01 0.778 0.436408
## distribution_channelDirect -4.532e-01 1.105e-01 -4.101 4.11e-05 ***
## distribution_channelGDS -1.299e+00 3.063e-01 -4.242 2.22e-05 ***
## distribution_channelTA/TO 2.169e-02 8.230e-02 0.264 0.792105
## distribution_channelUndefined 8.842e+03 9.160e+05 0.010 0.992298
## is_repeated_guest1 -9.290e-01 9.863e-02 -9.419 < 2e-16 ***
## previous_cancellations 3.045e+00 7.371e-02 41.316 < 2e-16 ***
## previous_bookings_not_canceled -4.847e-01 2.920e-02 -16.601 < 2e-16 ***
## reserved_room_typeB 2.107e-02 8.658e-02 0.243 0.807736
## reserved_room_typeC 2.657e-01 1.039e-01 2.556 0.010578 *
## reserved_room_typeD -4.106e-02 2.531e-02 -1.622 0.104749
## reserved_room_typeE 6.310e-02 4.097e-02 1.540 0.123519
## reserved_room_typeF -1.983e-01 6.452e-02 -3.073 0.002119 **
## reserved_room_typeG 6.456e-02 7.704e-02 0.838 0.402025
## reserved_room_typeH 4.207e-01 1.306e-01 3.221 0.001277 **
## reserved_room_typeL 9.301e-01 9.685e-01 0.960 0.336896
## reserved_room_typeP 1.606e+01 3.104e+02 0.052 0.958741
## deposit_typeNon Refund 5.791e+00 1.362e-01 42.520 < 2e-16 ***
## deposit_typeRefundable 3.534e-01 2.347e-01 1.506 0.132141
## days_in_waiting_list 7.084e-04 5.769e-04 1.228 0.219499
## customer_typeGroup -1.244e-01 2.054e-01 -0.606 0.544611
## customer_typeTransient 1.018e+00 6.291e-02 16.178 < 2e-16 ***
## customer_typeTransient-Party 5.008e-01 6.710e-02 7.463 8.44e-14 ***
## adr 2.537e-03 3.410e-04 7.439 1.01e-13 ***
## required_car_parking_spaces -8.855e+03 9.160e+05 -0.010 0.992286
## stay_nights_total 1.533e-02 6.919e-03 2.216 0.026689 *
## stay_cost_total 2.624e-04 6.310e-05 4.159 3.20e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 110170 on 83571 degrees of freedom
## Residual deviance: 75236 on 83530 degrees of freedom
## AIC: 75320
##
## Number of Fisher Scoring iterations: 13
# Score the test set with the logistic model, then label a booking as
# cancelled ("1") when its predicted probability is above 0.5
test$logit_pred_prob <- predict(
  logit_training_model,
  newdata = test,
  type = "response"
)
test$logit_pred_class <- ifelse(test$logit_pred_prob > 0.5, "1", "0")
# Tally of wrong (FALSE) and correct (TRUE) predictions
with(test, table(is_canceled == logit_pred_class))
##
## FALSE TRUE
## 7875 27939
# Confusion matrix: predicted class in rows, actual class in columns
with(test, table(predicted = logit_pred_class, actual = is_canceled))
## actual
## predicted 0 1
## 0 21650 6982
## 1 893 6289
# Logistic regression accuracy (about 78%), computed from the predictions
# instead of a hand-copied count. Rows with no prediction (NA) are left out.
mean(test$is_canceled == test$logit_pred_class, na.rm = TRUE)
## [1] 0.7800486
#Random Forest Modeling
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## 載入套件:'randomForest'
## 下列物件被遮斷自 'package:dplyr':
##
## combine
## 下列物件被遮斷自 'package:ggplot2':
##
## margin
# Baseline random forest: 500 trees, 2 candidate variables per split,
# equal voting cutoff for both classes, variable importance recorded
set.seed(1)
rf_training_model <- randomForest(
  is_canceled ~ .,
  data = training_1,
  ntree = 500,
  mtry = 2,
  cutoff = c(0.5, 0.5),
  importance = TRUE
)
# Print the out-of-bag error estimate and confusion matrix
rf_training_model
##
## Call:
## randomForest(formula = is_canceled ~ ., data = training_1, ntree = 500, cutoff = c(0.5, 0.5), mtry = 2, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 21.79%
## Confusion matrix:
## 0 1 class.error
## 0 51977 646 0.0122760
## 1 17565 13384 0.5675466
# Tune mtry: start from 2 and let tuneRF search neighbouring values,
# comparing out-of-bag error with 500 trees per candidate
set.seed(1)
res <- tuneRF(
  x = select(training_1, -is_canceled),
  y = training_1$is_canceled,
  mtryStart = 2,
  ntreeTry = 500
)
## mtry = 2 OOB error = 21.71%
## Searching left ...
## mtry = 1 OOB error = 25.13%
## -0.1574625 0.05
## Searching right ...
## mtry = 4 OOB error = 18.42%
## 0.1513448 0.05
## mtry = 8 OOB error = 17.36%
## 0.05779971 0.05
## mtry = 16 OOB error = 17.65%
## -0.01695616 0.05
# Refit the forest with mtry = 8, the value with the lowest out-of-bag error
# in the tuning run; all other settings match the baseline forest
rf_best_model <- randomForest(
  is_canceled ~ .,
  data = training_1,
  ntree = 500,
  mtry = 8,
  cutoff = c(0.5, 0.5),
  importance = TRUE
)
# Print the out-of-bag error estimate and confusion matrix
rf_best_model
##
## Call:
## randomForest(formula = is_canceled ~ ., data = training_1, ntree = 500, cutoff = c(0.5, 0.5), mtry = 8, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 17.34%
## Confusion matrix:
## 0 1 class.error
## 0 46760 5863 0.1114152
## 1 8632 22317 0.2789105
# Evaluate the tuned forest on the held-out test set:
# probability of class "1" (cancelled) and the predicted class label
test$rf_pred_prob <- predict(rf_best_model, newdata = test, type = "prob")[, "1"]
test$rf_pred_class <- predict(rf_best_model, newdata = test, type = "response")
# Tally of wrong (FALSE) and correct (TRUE) predictions
with(test, table(is_canceled == rf_pred_class))
##
## FALSE TRUE
## 6365 29449
# Random forest accuracy, computed from the predictions rather than a
# hand-copied count (the hard-coded 29451 did not match the 29449 correct
# predictions in the table). Rows with no prediction (NA) are left out.
mean(test$is_canceled == test$rf_pred_class, na.rm = TRUE)
## [1] 0.8222632
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.