Setting up my R environment by loading the ‘tidyverse’, ‘skimr’, and ‘janitor’ packages
Import data
hotel_bookings <- read_csv("~/Desktop/hotel_bookings.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Inspect data
bookings_df <- read_csv("~/Desktop/hotel_bookings.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (13): hotel, arrival_date_month, meal, country, market_segment, distrib...
## dbl  (18): is_canceled, lead_time, arrival_date_year, arrival_date_week_numb...
## date  (1): reservation_status_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(bookings_df)
## # A tibble: 6 × 32
##   hotel        is_canceled lead_time arrival_date_ye… arrival_date_mo… arrival_date_we…
##   <chr>              <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Resort Hotel           0       342             2015 July                           27
## 2 Resort Hotel           0       737             2015 July                           27
## 3 Resort Hotel           0         7             2015 July                           27
## 4 Resort Hotel           0        13             2015 July                           27
## 5 Resort Hotel           0        14             2015 July                           27
## 6 Resort Hotel           0        14             2015 July                           27
## # … with 26 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, …
str(bookings_df)
## spec_tbl_df [119,390 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ hotel                         : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
##  $ is_canceled                   : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
##  $ lead_time                     : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
##  $ arrival_date_year             : num [1:119390] 2015 2015 2015 2015 2015 ...
##  $ arrival_date_month            : chr [1:119390] "July" "July" "July" "July" ...
##  $ arrival_date_week_number      : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
##  $ arrival_date_day_of_month     : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
##  $ stays_in_weekend_nights       : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ stays_in_week_nights          : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
##  $ adults                        : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
##  $ children                      : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ babies                        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ meal                          : chr [1:119390] "BB" "BB" "BB" "BB" ...
##  $ country                       : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
##  $ market_segment                : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
##  $ distribution_channel          : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
##  $ is_repeated_guest             : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_cancellations        : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ reserved_room_type            : chr [1:119390] "C" "C" "A" "A" ...
##  $ assigned_room_type            : chr [1:119390] "C" "C" "C" "A" ...
##  $ booking_changes               : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
##  $ deposit_type                  : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
##  $ agent                         : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
##  $ company                       : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
##  $ days_in_waiting_list          : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ customer_type                 : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
##  $ adr                           : num [1:119390] 0 0 75 75 98 ...
##  $ required_car_parking_spaces   : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
##  $ total_of_special_requests     : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
##  $ reservation_status            : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
##  $ reservation_status_date       : Date[1:119390], format: "2015-07-01" "2015-07-01" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   hotel = col_character(),
##   ..   is_canceled = col_double(),
##   ..   lead_time = col_double(),
##   ..   arrival_date_year = col_double(),
##   ..   arrival_date_month = col_character(),
##   ..   arrival_date_week_number = col_double(),
##   ..   arrival_date_day_of_month = col_double(),
##   ..   stays_in_weekend_nights = col_double(),
##   ..   stays_in_week_nights = col_double(),
##   ..   adults = col_double(),
##   ..   children = col_double(),
##   ..   babies = col_double(),
##   ..   meal = col_character(),
##   ..   country = col_character(),
##   ..   market_segment = col_character(),
##   ..   distribution_channel = col_character(),
##   ..   is_repeated_guest = col_double(),
##   ..   previous_cancellations = col_double(),
##   ..   previous_bookings_not_canceled = col_double(),
##   ..   reserved_room_type = col_character(),
##   ..   assigned_room_type = col_character(),
##   ..   booking_changes = col_double(),
##   ..   deposit_type = col_character(),
##   ..   agent = col_character(),
##   ..   company = col_character(),
##   ..   days_in_waiting_list = col_double(),
##   ..   customer_type = col_character(),
##   ..   adr = col_double(),
##   ..   required_car_parking_spaces = col_double(),
##   ..   total_of_special_requests = col_double(),
##   ..   reservation_status = col_character(),
##   ..   reservation_status_date = col_date(format = "")
##   .. )
##  - attr(*, "problems")=<externalptr>
glimpse(bookings_df)
## Rows: 119,390
## Columns: 32
## $ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
## $ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
## $ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
## $ arrival_date_year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
## $ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
## $ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
## $ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
## $ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
## $ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
## $ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
## $ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
## $ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
## $ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
## $ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
## $ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ customer_type                  <chr> "Transient", "Transient", "Transient", …
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
## $ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
## $ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
colnames(bookings_df)
##  [1] "hotel"                          "is_canceled"                   
##  [3] "lead_time"                      "arrival_date_year"             
##  [5] "arrival_date_month"             "arrival_date_week_number"      
##  [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
##  [9] "stays_in_week_nights"           "adults"                        
## [11] "children"                       "babies"                        
## [13] "meal"                           "country"                       
## [15] "market_segment"                 "distribution_channel"          
## [17] "is_repeated_guest"              "previous_cancellations"        
## [19] "previous_bookings_not_canceled" "reserved_room_type"            
## [21] "assigned_room_type"             "booking_changes"               
## [23] "deposit_type"                   "agent"                         
## [25] "company"                        "days_in_waiting_list"          
## [27] "customer_type"                  "adr"                           
## [29] "required_car_parking_spaces"    "total_of_special_requests"     
## [31] "reservation_status"             "reservation_status_date"
skim_without_charts(bookings_df)
Data summary
Name bookings_df
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
character 13
Date 1
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hotel 0 1 10 12 0 2 0
arrival_date_month 0 1 3 9 0 12 0
meal 0 1 2 9 0 5 0
country 0 1 2 4 0 178 0
market_segment 0 1 6 13 0 8 0
distribution_channel 0 1 3 9 0 5 0
reserved_room_type 0 1 1 1 0 10 0
assigned_room_type 0 1 1 1 0 12 0
deposit_type 0 1 10 10 0 3 0
agent 0 1 1 4 0 334 0
company 0 1 1 4 0 353 0
customer_type 0 1 5 15 0 4 0
reservation_status 0 1 7 9 0 3 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5
Cleaning data

Notes: Focus on the following variables: ‘hotel’, ‘is_canceled’, and ‘lead_time’. Create a new data frame with just those columns, called ‘trimmed_df’, by adding the variable names to this code chunk, and rename the variable ‘hotel’ to ‘hotel_type’ so it is easier to understand

# Keep only the three columns of interest.
trimmed_df <- bookings_df %>%
  select(hotel, is_canceled, lead_time)

# Preview the data with `hotel` renamed to `hotel_type`. trimmed_df already
# holds exactly these three columns, so no second select() is needed. The
# renamed tibble is only printed here; trimmed_df itself keeps the name `hotel`.
trimmed_df %>%
  rename(hotel_type = hotel)
## # A tibble: 119,390 × 3
##    hotel_type   is_canceled lead_time
##    <chr>              <dbl>     <dbl>
##  1 Resort Hotel           0       342
##  2 Resort Hotel           0       737
##  3 Resort Hotel           0         7
##  4 Resort Hotel           0        13
##  5 Resort Hotel           0        14
##  6 Resort Hotel           0        14
##  7 Resort Hotel           0         0
##  8 Resort Hotel           0         9
##  9 Resort Hotel           1        85
## 10 Resort Hotel           1        75
## # … with 119,380 more rows
head(trimmed_df)
## # A tibble: 6 × 3
##   hotel        is_canceled lead_time
##   <chr>              <dbl>     <dbl>
## 1 Resort Hotel           0       342
## 2 Resort Hotel           0       737
## 3 Resort Hotel           0         7
## 4 Resort Hotel           0        13
## 5 Resort Hotel           0        14
## 6 Resort Hotel           0        14

Notes: combine the arrival month and year into one column using the unite() function

# Build a single "<month> <year>" column (e.g. "July 2015") from the two
# arrival date columns; unite() drops the input columns by default.
trimmed_df <- bookings_df %>%
  select(arrival_date_year, arrival_date_month) %>%
  unite(
    col = "arrival_month_year",
    arrival_date_month, arrival_date_year,
    sep = " "
  )
head(trimmed_df)
## # A tibble: 6 × 1
##   arrival_month_year
##   <chr>             
## 1 July 2015         
## 2 July 2015         
## 3 July 2015         
## 4 July 2015         
## 5 July 2015         
## 6 July 2015

Notes: create a new column that sums the adults, children, and babies on a reservation to get the total number of people

# Total party size per reservation. `children` has 4 missing values in this
# data set, so `guests` is NA for those rows.
trimmed_df <- mutate(
  bookings_df,
  guests = adults + children + babies
)
head(trimmed_df)
## # A tibble: 6 × 33
##   hotel        is_canceled lead_time arrival_date_ye… arrival_date_mo… arrival_date_we…
##   <chr>              <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Resort Hotel           0       342             2015 July                           27
## 2 Resort Hotel           0       737             2015 July                           27
## 3 Resort Hotel           0         7             2015 July                           27
## 4 Resort Hotel           0        13             2015 July                           27
## 5 Resort Hotel           0        14             2015 July                           27
## 6 Resort Hotel           0        14             2015 July                           27
## # … with 27 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, …

Notes: calculate the total number of canceled bookings and the average lead time

# Collapse the whole data set to one row: the count of canceled bookings
# (is_canceled is 0/1, so its sum is a count) and the mean lead time.
trimmed_df <- summarize(
  bookings_df,
  number_canceled = sum(is_canceled),
  average_lead_time = mean(lead_time)
)
head(trimmed_df)
## # A tibble: 1 × 2
##   number_canceled average_lead_time
##             <dbl>             <dbl>
## 1           44224              104.
Manipulating data

Notes: arrange the data by lead time; the goal is most lead time to least lead time, but arrange() sorts in ascending order by default

arrange(hotel_bookings, lead_time)
## # A tibble: 119,390 × 32
##    hotel        is_canceled lead_time arrival_date_year arrival_date_month
##    <chr>              <dbl>     <dbl>             <dbl> <chr>             
##  1 Resort Hotel           0         0              2015 July              
##  2 Resort Hotel           0         0              2015 July              
##  3 Resort Hotel           0         0              2015 July              
##  4 Resort Hotel           0         0              2015 July              
##  5 Resort Hotel           0         0              2015 July              
##  6 Resort Hotel           0         0              2015 July              
##  7 Resort Hotel           0         0              2015 July              
##  8 Resort Hotel           0         0              2015 July              
##  9 Resort Hotel           0         0              2015 July              
## 10 Resort Hotel           0         0              2015 July              
## # … with 119,380 more rows, and 27 more variables:
## #   arrival_date_week_number <dbl>, arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, …

Notes: arrange() needs to be told explicitly to sort in descending order, using desc()

arrange(hotel_bookings, desc(lead_time))
## # A tibble: 119,390 × 32
##    hotel        is_canceled lead_time arrival_date_year arrival_date_month
##    <chr>              <dbl>     <dbl>             <dbl> <chr>             
##  1 Resort Hotel           0       737              2015 July              
##  2 Resort Hotel           0       709              2016 February          
##  3 City Hotel             1       629              2017 March             
##  4 City Hotel             1       629              2017 March             
##  5 City Hotel             1       629              2017 March             
##  6 City Hotel             1       629              2017 March             
##  7 City Hotel             1       629              2017 March             
##  8 City Hotel             1       629              2017 March             
##  9 City Hotel             1       629              2017 March             
## 10 City Hotel             1       629              2017 March             
## # … with 119,380 more rows, and 27 more variables:
## #   arrival_date_week_number <dbl>, arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, …

Notes: store the arranged data in a data frame named ‘hotel_bookings_v2’

head(hotel_bookings)
## # A tibble: 6 × 32
##   hotel        is_canceled lead_time arrival_date_ye… arrival_date_mo… arrival_date_we…
##   <chr>              <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Resort Hotel           0       342             2015 July                           27
## 2 Resort Hotel           0       737             2015 July                           27
## 3 Resort Hotel           0         7             2015 July                           27
## 4 Resort Hotel           0        13             2015 July                           27
## 5 Resort Hotel           0        14             2015 July                           27
## 6 Resort Hotel           0        14             2015 July                           27
## # … with 26 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, …
hotel_bookings_v2 <- arrange(hotel_bookings, desc(lead_time))
head(hotel_bookings_v2)
## # A tibble: 6 × 32
##   hotel        is_canceled lead_time arrival_date_ye… arrival_date_mo… arrival_date_we…
##   <chr>              <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Resort Hotel           0       737             2015 July                           27
## 2 Resort Hotel           0       709             2016 February                        9
## 3 City Hotel             1       629             2017 March                          13
## 4 City Hotel             1       629             2017 March                          13
## 5 City Hotel             1       629             2017 March                          13
## 6 City Hotel             1       629             2017 March                          13
## # … with 26 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, …

Notes: find the maximum and minimum lead time, then find the average lead time for bookings, because I want to know how early I should run promotions for hotel rooms; the average is computed on the v2 data set as well

max(hotel_bookings$lead_time)
## [1] 737
min(hotel_bookings$lead_time)
## [1] 0
mean(hotel_bookings_v2$lead_time)
## [1] 104.0114
mean(hotel_bookings$lead_time)
## [1] 104.0114

Notes: getting to know what the average lead time before booking is for just city hotels

# Subset to City Hotel bookings. Inside filter() columns are referenced by
# bare name (data masking), so the `hotel_bookings$` prefix is unnecessary.
hotel_bookings_city <- hotel_bookings %>%
  filter(hotel == "City Hotel")
head(hotel_bookings_city)
## # A tibble: 6 × 32
##   hotel      is_canceled lead_time arrival_date_ye… arrival_date_mo… arrival_date_we…
##   <chr>            <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 City Hotel           0         6             2015 July                           27
## 2 City Hotel           1        88             2015 July                           27
## 3 City Hotel           1        65             2015 July                           27
## 4 City Hotel           1        92             2015 July                           27
## 5 City Hotel           1       100             2015 July                           27
## 6 City Hotel           1        79             2015 July                           27
## # … with 26 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, …
mean(hotel_bookings_city$lead_time)
## [1] 109.7357

Notes: see how this information differs between resort hotels and city hotels

# Per-hotel-type lead time statistics: mean, minimum and maximum.
hotel_summary <- hotel_bookings %>%
  group_by(hotel) %>%
  summarize(
    average_lead_time = mean(lead_time),
    min_lead_time = min(lead_time),
    max_lead_time = max(lead_time)
  )
head(hotel_summary)
## # A tibble: 2 × 4
##   hotel        average_lead_time min_lead_time max_lead_time
##   <chr>                    <dbl>         <dbl>         <dbl>
## 1 City Hotel               110.              0           629
## 2 Resort Hotel              92.7             0           737

Aesthetics and Visualization data

Notes: install and load the ‘ggplot2’ package

Notes: use a bar chart to find out which distribution type has the largest number of bookings

# Bar chart: number of bookings per distribution channel.
ggplot(hotel_bookings, aes(x = distribution_channel)) +
  geom_bar()

Notes: dive deeper into the bar chart to see whether the number of bookings for each distribution type differs depending on whether or not there was a deposit, or on what market segment they represent

# Bookings per distribution channel, with each bar split by deposit type.
ggplot(hotel_bookings, aes(x = distribution_channel, fill = deposit_type)) +
  geom_bar()

# Bookings per distribution channel, with each bar split by market segment.
ggplot(hotel_bookings, aes(x = distribution_channel, fill = market_segment)) +
  geom_bar()

Notes: facets galore — create separate charts for each deposit type and market segment to help stakeholders understand the differences more clearly

# One panel per deposit type, each showing bookings per distribution channel.
ggplot(hotel_bookings, aes(x = distribution_channel)) +
  geom_bar() +
  facet_wrap(~ deposit_type)

# One panel per market segment, each showing bookings per distribution channel.
ggplot(hotel_bookings, aes(x = distribution_channel)) +
  geom_bar() +
  facet_wrap(~ market_segment)

Notes: using ‘facet_grid’ function that will include plots even if they are empty

# Same charts laid out with facet_grid(): one column of panels per deposit type.
ggplot(hotel_bookings, aes(x = distribution_channel)) +
  geom_bar() +
  facet_grid(~ deposit_type)

# One column of panels per market segment.
ggplot(hotel_bookings, aes(x = distribution_channel)) +
  geom_bar() +
  facet_grid(~ market_segment)

Notes: putting all into one chart and explore the differences by deposit type and market segment

# One panel per (deposit type, market segment) combination. Multiple faceting
# variables are written as a one-sided formula joined with `+` rather than
# chaining two tildes.
ggplot(hotel_bookings, aes(x = distribution_channel)) +
  geom_bar() +
  facet_wrap(~ deposit_type + market_segment)

Filters and Facets data

Notes: create a bar chart showing each hotel type and market segment by using different colors to represent each market segment

# Bookings per hotel type, with each bar colored by market segment.
ggplot(hotel_bookings, aes(x = hotel, fill = market_segment)) +
  geom_bar()

Notes: use the facet_wrap() function to create a separate plot for each market segment

# Bookings per hotel type, drawn as a separate panel for each market segment.
ggplot(hotel_bookings, aes(x = hotel)) +
  geom_bar() +
  facet_wrap(~ market_segment)

Notes: load the ‘tidyverse’ package, then filter the data set to just city hotels that are online TA

# City Hotel bookings that came through the "Online TA" market segment.
# Conditions passed to filter() as separate arguments are combined with AND;
# columns are referenced by bare name rather than via `hotel_bookings$`.
onlineta_city_hotels <- hotel_bookings %>%
  filter(hotel == "City Hotel", market_segment == "Online TA")

Notes: plot the data to show the relationship between lead_time and stays_in_week_nights

# Scatter plot of lead time against week-night stays for the filtered subset.
ggplot(onlineta_city_hotels, aes(x = lead_time, y = stays_in_week_nights)) +
  geom_point()

Notes: create a plot to determine if people with children book hotel rooms in advance (on x-axis, the plot shows how far in advance a booking is made, with the bookings furthest to the right happening the most in advance; on the y-axis it shows how many children there are in a party)

# Scatter plot of lead time (x) against number of children in the party (y).
ggplot(hotel_bookings, aes(x = lead_time, y = children)) +
  geom_point()