Exploring and preparing the data
str(hotel_bookings)
## spec_tbl_df [119,390 x 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ hotel : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : num [1:119390] 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr [1:119390] "July" "July" "July" "July" ...
## $ arrival_date_week_number : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
## $ children : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr [1:119390] "BB" "BB" "BB" "BB" ...
## $ country : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr [1:119390] "C" "C" "A" "A" ...
## $ assigned_room_type : chr [1:119390] "C" "C" "C" "A" ...
## $ booking_changes : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
## $ company : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num [1:119390] 0 0 75 75 98 ...
## $ required_car_parking_spaces : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : Date[1:119390], format: "2015-07-01" "2015-07-01" ...
## - attr(*, "spec")=
## .. cols(
## .. hotel = col_character(),
## .. is_canceled = col_double(),
## .. lead_time = col_double(),
## .. arrival_date_year = col_double(),
## .. arrival_date_month = col_character(),
## .. arrival_date_week_number = col_double(),
## .. arrival_date_day_of_month = col_double(),
## .. stays_in_weekend_nights = col_double(),
## .. stays_in_week_nights = col_double(),
## .. adults = col_double(),
## .. children = col_double(),
## .. babies = col_double(),
## .. meal = col_character(),
## .. country = col_character(),
## .. market_segment = col_character(),
## .. distribution_channel = col_character(),
## .. is_repeated_guest = col_double(),
## .. previous_cancellations = col_double(),
## .. previous_bookings_not_canceled = col_double(),
## .. reserved_room_type = col_character(),
## .. assigned_room_type = col_character(),
## .. booking_changes = col_double(),
## .. deposit_type = col_character(),
## .. agent = col_character(),
## .. company = col_character(),
## .. days_in_waiting_list = col_double(),
## .. customer_type = col_character(),
## .. adr = col_double(),
## .. required_car_parking_spaces = col_double(),
## .. total_of_special_requests = col_double(),
## .. reservation_status = col_character(),
## .. reservation_status_date = col_date(format = "")
## .. )
## - attr(*, "problems")=<externalptr>
glimpse(hotel_bookings)
## Rows: 119,390
## Columns: 32
## $ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort~
## $ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, ~
## $ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, ~
## $ arrival_date_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201~
## $ arrival_date_month <chr> "July", "July", "July", "July", "July",~
## $ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,~
## $ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, ~
## $ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB~
## $ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR~
## $ market_segment <chr> "Direct", "Direct", "Direct", "Corporat~
## $ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat~
## $ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",~
## $ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",~
## $ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit~
## $ agent <chr> "NULL", "NULL", "NULL", "304", "240", "~
## $ company <chr> "NULL", "NULL", "NULL", "NULL", "NULL",~
## $ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ customer_type <chr> "Transient", "Transient", "Transient", ~
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,~
## $ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, ~
## $ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", ~
## $ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20~
summary(hotel_bookings)
## hotel is_canceled lead_time arrival_date_year
## Length:119390 Min. :0.0000 Min. : 0 Min. :2015
## Class :character 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Mode :character Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## Length:119390 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.:16.00 1st Qu.: 8.0
## Mode :character Median :28.00 Median :16.0
## Mean :27.17 Mean :15.8
## 3rd Qu.:38.00 3rd Qu.:23.0
## Max. :53.00 Max. :31.0
##
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 Length:119390 Length:119390
## 1st Qu.: 0.0000 1st Qu.: 0.000000 Class :character Class :character
## Median : 0.0000 Median : 0.000000 Mode :character Mode :character
## Mean : 0.1039 Mean : 0.007949
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000
## Max. :10.0000 Max. :10.000000
## NA's :4
## market_segment distribution_channel is_repeated_guest
## Length:119390 Length:119390 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000
## Mean :0.03191
## 3rd Qu.:0.00000
## Max. :1.00000
##
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 Length:119390
## 1st Qu.: 0.00000 1st Qu.: 0.0000 Class :character
## Median : 0.00000 Median : 0.0000 Mode :character
## Mean : 0.08712 Mean : 0.1371
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :26.00000 Max. :72.0000
##
## assigned_room_type booking_changes deposit_type agent
## Length:119390 Min. : 0.0000 Length:119390 Length:119390
## Class :character 1st Qu.: 0.0000 Class :character Class :character
## Mode :character Median : 0.0000 Mode :character Mode :character
## Mean : 0.2211
## 3rd Qu.: 0.0000
## Max. :21.0000
##
## company days_in_waiting_list customer_type adr
## Length:119390 Min. : 0.000 Length:119390 Min. : -6.38
## Class :character 1st Qu.: 0.000 Class :character 1st Qu.: 69.29
## Mode :character Median : 0.000 Mode :character Median : 94.58
## Mean : 2.321 Mean : 101.83
## 3rd Qu.: 0.000 3rd Qu.: 126.00
## Max. :391.000 Max. :5400.00
##
## required_car_parking_spaces total_of_special_requests reservation_status
## Min. :0.00000 Min. :0.0000 Length:119390
## 1st Qu.:0.00000 1st Qu.:0.0000 Class :character
## Median :0.00000 Median :0.0000 Mode :character
## Mean :0.06252 Mean :0.5714
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :8.00000 Max. :5.0000
##
## reservation_status_date
## Min. :2014-10-17
## 1st Qu.:2016-02-01
## Median :2016-08-07
## Mean :2016-07-30
## 3rd Qu.:2017-02-08
## Max. :2017-09-14
##
A lot of these categorical data are as character data and therefore they do not have a lot of information
Converting categorical and binary variables into factor for bettering understanding and exploring
hotel_bookings <- hotel_bookings %>%
mutate(hotel = as.factor(hotel),
is_canceled = as.factor(is_canceled),
arrival_date_day_of_month = as.factor(arrival_date_day_of_month),
arrival_date_month = as.factor(arrival_date_month),
arrival_date_year = as.factor(arrival_date_year),
meal = as.factor(meal),
country = as.factor(country),
market_segment = as.factor(market_segment),
distribution_channel = as.factor(distribution_channel),
is_repeated_guest = as.factor(is_repeated_guest),
reserved_room_type = as.factor(reserved_room_type),
assigned_room_type = as.factor(assigned_room_type),
deposit_type = as.factor(deposit_type),
customer_type = as.factor(customer_type),
reservation_status = as.factor(reservation_status),
agent = as.factor(agent),
company = as.factor(company)
)
glimpse(hotel_bookings) #seeing if the conversion worked
## Rows: 119,390
## Columns: 32
## $ hotel <fct> Resort Hotel, Resort Hotel, Resort Hote~
## $ is_canceled <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, ~
## $ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, ~
## $ arrival_date_year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 201~
## $ arrival_date_month <fct> July, July, July, July, July, July, Jul~
## $ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,~
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, ~
## $ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~
## $ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ meal <fct> BB, BB, BB, BB, BB, BB, BB, FB, BB, HB,~
## $ country <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT,~
## $ market_segment <fct> Direct, Direct, Direct, Corporate, Onli~
## $ distribution_channel <fct> Direct, Direct, Direct, Corporate, TA/T~
## $ is_repeated_guest <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ reserved_room_type <fct> C, C, A, A, A, A, C, C, A, D, E, D, D, ~
## $ assigned_room_type <fct> C, C, C, A, A, A, C, C, A, D, E, D, E, ~
## $ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ deposit_type <fct> No Deposit, No Deposit, No Deposit, No ~
## $ agent <fct> NULL, NULL, NULL, 304, 240, 240, NULL, ~
## $ company <fct> NULL, NULL, NULL, NULL, NULL, NULL, NUL~
## $ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ customer_type <fct> Transient, Transient, Transient, Transi~
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,~
## $ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, ~
## $ reservation_status <fct> Check-Out, Check-Out, Check-Out, Check-~
## $ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20~