Quick look at the data structure
## [1] "hotel" "is_canceled"
## [3] "lead_time" "arrival_date_year"
## [5] "arrival_date_month" "arrival_date_week_number"
## [7] "arrival_date_day_of_month" "stays_in_weekend_nights"
## [9] "stays_in_week_nights" "adults"
## [11] "children" "babies"
## [13] "meal" "country"
## [15] "market_segment" "distribution_channel"
## [17] "is_repeated_guest" "previous_cancellations"
## [19] "previous_bookings_not_canceled" "reserved_room_type"
## [21] "assigned_room_type" "booking_changes"
## [23] "deposit_type" "agent"
## [25] "company" "days_in_waiting_list"
## [27] "customer_type" "adr"
## [29] "required_car_parking_spaces" "total_of_special_requests"
## [31] "reservation_status" "reservation_status_date"
## 'data.frame': 119390 obs. of 32 variables:
## $ hotel : chr "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : int 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : int 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr "July" "July" "July" "July" ...
## $ arrival_date_week_number : int 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : int 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : int 2 2 1 1 2 2 2 2 2 2 ...
## $ children : int 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : int 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr "BB" "BB" "BB" "BB" ...
## $ country : chr "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: int 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr "C" "C" "A" "A" ...
## $ assigned_room_type : chr "C" "C" "C" "A" ...
## $ booking_changes : int 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr "NULL" "NULL" "NULL" "304" ...
## $ company : chr "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : int 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num 0 0 75 75 98 ...
## $ required_car_parking_spaces : int 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : int 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr "2015-07-01" "2015-07-01" "2015-07-02" "2015-07-02" ...
Data summary
|                          |                |
|:-------------------------|:---------------|
| Name                     | hotel_bookings |
| Number of rows           | 119390         |
| Number of columns        | 32             |
| _______________________  |                |
| Column type frequency:   |                |
| character                | 14             |
| numeric                  | 18             |
| ________________________ |                |
| Group variables          | None           |
Variable type: character
| skim_variable           | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:------------------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| hotel                   | 0 | 1 | 10 | 12 | 0 | 2   | 0 |
| arrival_date_month      | 0 | 1 | 3  | 9  | 0 | 12  | 0 |
| meal                    | 0 | 1 | 2  | 9  | 0 | 5   | 0 |
| country                 | 0 | 1 | 2  | 4  | 0 | 178 | 0 |
| market_segment          | 0 | 1 | 6  | 13 | 0 | 8   | 0 |
| distribution_channel    | 0 | 1 | 3  | 9  | 0 | 5   | 0 |
| reserved_room_type      | 0 | 1 | 1  | 1  | 0 | 10  | 0 |
| assigned_room_type      | 0 | 1 | 1  | 1  | 0 | 12  | 0 |
| deposit_type            | 0 | 1 | 10 | 10 | 0 | 3   | 0 |
| agent                   | 0 | 1 | 1  | 4  | 0 | 334 | 0 |
| company                 | 0 | 1 | 1  | 4  | 0 | 353 | 0 |
| customer_type           | 0 | 1 | 5  | 15 | 0 | 4   | 0 |
| reservation_status      | 0 | 1 | 7  | 9  | 0 | 3   | 0 |
| reservation_status_date | 0 | 1 | 10 | 10 | 0 | 926 | 0 |
Variable type: numeric
| skim_variable                  | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|:-------------------------------|----------:|--------------:|-----:|---:|---:|----:|----:|----:|-----:|:-----|
| is_canceled                    | 0 | 1 | 0.37    | 0.48   | 0.00    | 0.00    | 0.00    | 1    | 1    | ▇▁▁▁▅ |
| lead_time                      | 0 | 1 | 104.01  | 106.86 | 0.00    | 18.00   | 69.00   | 160  | 737  | ▇▂▁▁▁ |
| arrival_date_year              | 0 | 1 | 2016.16 | 0.71   | 2015.00 | 2016.00 | 2016.00 | 2017 | 2017 | ▃▁▇▁▆ |
| arrival_date_week_number       | 0 | 1 | 27.17   | 13.61  | 1.00    | 16.00   | 28.00   | 38   | 53   | ▅▇▇▇▅ |
| arrival_date_day_of_month      | 0 | 1 | 15.80   | 8.78   | 1.00    | 8.00    | 16.00   | 23   | 31   | ▇▇▇▇▆ |
| stays_in_weekend_nights        | 0 | 1 | 0.93    | 1.00   | 0.00    | 0.00    | 1.00    | 2    | 19   | ▇▁▁▁▁ |
| stays_in_week_nights           | 0 | 1 | 2.50    | 1.91   | 0.00    | 1.00    | 2.00    | 3    | 50   | ▇▁▁▁▁ |
| adults                         | 0 | 1 | 1.86    | 0.58   | 0.00    | 2.00    | 2.00    | 2    | 55   | ▇▁▁▁▁ |
| children                       | 4 | 1 | 0.10    | 0.40   | 0.00    | 0.00    | 0.00    | 0    | 10   | ▇▁▁▁▁ |
| babies                         | 0 | 1 | 0.01    | 0.10   | 0.00    | 0.00    | 0.00    | 0    | 10   | ▇▁▁▁▁ |
| is_repeated_guest              | 0 | 1 | 0.03    | 0.18   | 0.00    | 0.00    | 0.00    | 0    | 1    | ▇▁▁▁▁ |
| previous_cancellations         | 0 | 1 | 0.09    | 0.84   | 0.00    | 0.00    | 0.00    | 0    | 26   | ▇▁▁▁▁ |
| previous_bookings_not_canceled | 0 | 1 | 0.14    | 1.50   | 0.00    | 0.00    | 0.00    | 0    | 72   | ▇▁▁▁▁ |
| booking_changes                | 0 | 1 | 0.22    | 0.65   | 0.00    | 0.00    | 0.00    | 0    | 21   | ▇▁▁▁▁ |
| days_in_waiting_list           | 0 | 1 | 2.32    | 17.59  | 0.00    | 0.00    | 0.00    | 0    | 391  | ▇▁▁▁▁ |
| adr                            | 0 | 1 | 101.83  | 50.54  | -6.38   | 69.29   | 94.58   | 126  | 5400 | ▇▁▁▁▁ |
| required_car_parking_spaces    | 0 | 1 | 0.06    | 0.25   | 0.00    | 0.00    | 0.00    | 0    | 8    | ▇▁▁▁▁ |
| total_of_special_requests      | 0 | 1 | 0.57    | 0.79   | 0.00    | 0.00    | 0.00    | 1    | 5    | ▇▁▁▁▁ |
Finding: there are 4 missing values in `children`.
# Number of bookings (rows) for each hotel type.
hotel_bookings %>%
  group_by(hotel) %>%
  summarise(count = n())
## # A tibble: 2 × 2
## hotel count
## <chr> <int>
## 1 City Hotel 79330
## 2 Resort Hotel 40060
## Rows: 119,390
## Columns: 32
## $ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time <int> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year <int> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number <int> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights <int> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults <int> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes <int> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type <chr> "Transient", "Transient", "Transient", …
## $ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date <chr> "2015-07-01", "2015-07-01", "2015-07-02…
Finding: `agent` and `company` contain the literal string "NULL" rather than `NA`, so these entries are not reported as missing in the summary above.
# Number of bookings (rows) for each hotel type.
hotel_bookings %>%
  group_by(hotel) %>%
  summarise(count = n())
## # A tibble: 2 × 2
## hotel count
## <chr> <int>
## 1 City Hotel 79330
## 2 Resort Hotel 40060
# Number of bookings (rows) for each arrival year.
hotel_bookings %>%
  group_by(arrival_date_year) %>%
  summarise(count = n())
## # A tibble: 3 × 2
## arrival_date_year count
## <int> <int>
## 1 2015 21996
## 2 2016 56707
## 3 2017 40687
# Number of bookings (rows) for each arrival week number.
hotel_bookings %>%
  group_by(arrival_date_week_number) %>%
  summarise(count = n())
## # A tibble: 53 × 2
## arrival_date_week_number count
## <int> <int>
## 1 1 1047
## 2 2 1218
## 3 3 1319
## 4 4 1487
## 5 5 1387
## 6 6 1508
## 7 7 2109
## 8 8 2216
## 9 9 2117
## 10 10 2149
## # ℹ 43 more rows
# Number of bookings (rows) for each meal code.
hotel_bookings %>%
  group_by(meal) %>%
  summarise(count = n())
## # A tibble: 5 × 2
## meal count
## <chr> <int>
## 1 BB 92310
## 2 FB 798
## 3 HB 14463
## 4 SC 10650
## 5 Undefined 1169
# Number of bookings (rows) for each country code.
hotel_bookings %>%
  group_by(country) %>%
  summarise(count = n())
## # A tibble: 178 × 2
## country count
## <chr> <int>
## 1 ABW 2
## 2 AGO 362
## 3 AIA 1
## 4 ALB 12
## 5 AND 7
## 6 ARE 51
## 7 ARG 214
## 8 ARM 8
## 9 ASM 1
## 10 ATA 2
## # ℹ 168 more rows
# Number of bookings (rows) for each market segment.
hotel_bookings %>%
  group_by(market_segment) %>%
  summarise(count = n())
## # A tibble: 8 × 2
## market_segment count
## <chr> <int>
## 1 Aviation 237
## 2 Complementary 743
## 3 Corporate 5295
## 4 Direct 12606
## 5 Groups 19811
## 6 Offline TA/TO 24219
## 7 Online TA 56477
## 8 Undefined 2
# Number of bookings (rows) for each distribution channel.
hotel_bookings %>%
  group_by(distribution_channel) %>%
  summarise(count = n())
## # A tibble: 5 × 2
## distribution_channel count
## <chr> <int>
## 1 Corporate 6677
## 2 Direct 14645
## 3 GDS 193
## 4 TA/TO 97870
## 5 Undefined 5
# Number of bookings (rows) for each customer type.
hotel_bookings %>%
  group_by(customer_type) %>%
  summarise(count = n())
## # A tibble: 4 × 2
## customer_type count
## <chr> <int>
## 1 Contract 4076
## 2 Group 577
## 3 Transient 89613
## 4 Transient-Party 25124
# Number of bookings (rows) for each arrival month.
# The month is a character column, so groups come out in alphabetical order.
hotel_bookings %>%
  group_by(arrival_date_month) %>%
  summarise(count = n())
## # A tibble: 12 × 2
## arrival_date_month count
## <chr> <int>
## 1 April 11089
## 2 August 13877
## 3 December 6780
## 4 February 8068
## 5 January 5929
## 6 July 12661
## 7 June 10939
## 8 March 9794
## 9 May 11791
## 10 November 6794
## 11 October 11160
## 12 September 10508