Introduction

Check out this Kaggle dataset: Hotel Booking Demand (hotel_bookings.csv)

Objective

Predict hotel room prices (adr, the average daily rate) using 2 supervised ML algorithms:

  1. Elastic Net
  2. Random Forest

Get Data

#libraries (assumed loaded up-front in the original):
library(tidyverse)  #dplyr, ggplot2, tidyr, purrr, forcats, ...
library(janitor)    #clean_names()
library(lubridate)  #is.Date()
library(skimr)      #skim_without_charts(), n_unique()
library(naniar)     #miss_var_summary()
library(plotly)     #plot_ly(), ggplotly()
library(vip)        #vip()

a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>% 
  mutate(across(where(is.character), factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% #order columns alphabetically
  select(
    where(is.Date),
    where(is.character),
    where(is.factor),
    where(is.numeric)
  ) %>% #then group columns by type
  filter(is_canceled == 0) #filter to non-canceled bookings

a$is_canceled = NULL             #constant after the filter above
a$reservation_status_date = NULL #date stamp; not used as a predictor

a %>% sample_n(10)

5 min EDA

viz missing

a %>% visdat::vis_dat(warn_large_data = FALSE)

check missing, na, inf

a %>% select(where(is.factor)) %>% funModeling::status() %>% arrange(-q_na)
a %>% select(where(is.numeric)) %>% funModeling::status() %>% arrange(-q_na)
a %>% select(where(is.numeric)) %>% funModeling::status() %>% arrange(-p_zeros)

Observations

  1. While there are no missing values, 2 vars unexpectedly contain 0s:
    • adr: how can the average daily rate (price) be $0.00?
    • adults: how can a booking have 0 adults? it would imply children are making the bookings

Wrangling / Cleaning
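Before the variable-by-variable EDA, it is worth eyeballing the rows behind the two oddities flagged above. A minimal sketch (inspection only; the $0.00-rate rows are ultimately dropped inside the recipes via step_filter(adr > 0)):

#which market segments produce the $0.00 rates?
a %>% filter(adr == 0) %>% count(market_segment, sort = TRUE)

#what do the zero-adult bookings look like?
a %>% filter(adults == 0) %>% count(adults, children, babies, sort = TRUE)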

EDA: nom vars

names

(nom.vars = a %>% select(where(is.factor)) %>% colnames %>% as.character)
##  [1] "agent"                "arrival_date_month"   "assigned_room_type"  
##  [4] "company"              "country"              "customer_type"       
##  [7] "deposit_type"         "distribution_channel" "hotel"               
## [10] "market_segment"       "meal"                 "reservation_status"  
## [13] "reserved_room_type"

check missing

a %>% select(all_of(nom.vars)) %>% miss_var_summary

sample

a %>% select(all_of(nom.vars)) %>% sample_n(5)

skim

a %>% select(all_of(nom.vars)) %>% skim_without_charts()
Data summary:
  Name: Piped data
  Number of rows: 75166
  Number of columns: 13
  Column type frequency: factor (13)
  Group variables: None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
agent 0 1 FALSE 315 9: 18697, NUL: 12310, 240: 8438, 7: 3065
arrival_date_month 0 1 FALSE 12 Aug: 8638, Jul: 7919, May: 7114, Oct: 6914
assigned_room_type 0 1 FALSE 10 A: 41105, D: 18960, E: 5838, F: 2824
company 0 1 FALSE 332 NUL: 69560, 40: 850, 223: 665, 45: 222
country 0 1 FALSE 166 PRT: 21071, GBR: 9676, FRA: 8481, ESP: 6391
customer_type 0 1 FALSE 4 Tra: 53099, Tra: 18735, Con: 2814, Gro: 518
deposit_type 0 1 FALSE 3 No : 74947, Ref: 126, Non: 93
distribution_channel 0 1 FALSE 5 TA/: 57718, Dir: 12088, Cor: 5203, GDS: 156
hotel 0 1 FALSE 2 Cit: 46228, Res: 28938
market_segment 0 1 FALSE 7 Onl: 35738, Off: 15908, Dir: 10672, Gro: 7714
meal 0 1 FALSE 5 BB: 57800, HB: 9479, SC: 6684, Und: 883
reservation_status 0 1 FALSE 1 Che: 75166, Can: 0, No-: 0
reserved_room_type 0 1 FALSE 9 A: 52364, D: 13099, E: 4621, F: 2017

viz: level counts distribution

a %>% 
  select(all_of(nom.vars)) %>% 
  map(n_unique) %>% 
  as.data.frame.list() %>% 
  pivot_longer(everything()) %>% 
  mutate(name = fct_reorder(name, value)) %>% 
  plot_ly(x = ~value, y = ~name, color = ~name) %>% 
  hide_legend() %>% 
  layout(
    title = 'factor vars level counts',
    xaxis = list(title = 'count'),
    yaxis = list(title = '')
  )

a %>% select(all_of(nom.vars)) %>% select(-company, -agent, -country) %>% funModeling::freq()

##    arrival_date_month frequency percentage cumulative_perc
## 1              August      8638      11.49           11.49
## 2                July      7919      10.54           22.03
## 3                 May      7114       9.46           31.49
## 4             October      6914       9.20           40.69
## 5               March      6645       8.84           49.53
## 6               April      6565       8.73           58.26
## 7                June      6404       8.52           66.78
## 8           September      6392       8.50           75.28
## 9            February      5372       7.15           82.43
## 10           November      4672       6.22           88.65
## 11           December      4409       5.87           94.52
## 12            January      4122       5.48          100.00

##    assigned_room_type frequency percentage cumulative_perc
## 1                   A     41105      54.69           54.69
## 2                   D     18960      25.22           79.91
## 3                   E      5838       7.77           87.68
## 4                   F      2824       3.76           91.44
## 5                   C      1929       2.57           94.01
## 6                   G      1773       2.36           96.37
## 7                   B      1651       2.20           98.57
## 8                   H       461       0.61           99.18
## 9                   I       358       0.48           99.66
## 10                  K       267       0.36          100.00

##     customer_type frequency percentage cumulative_perc
## 1       Transient     53099      70.64           70.64
## 2 Transient-Party     18735      24.92           95.56
## 3        Contract      2814       3.74           99.30
## 4           Group       518       0.69          100.00

##   deposit_type frequency percentage cumulative_perc
## 1   No Deposit     74947      99.71           99.71
## 2   Refundable       126       0.17           99.88
## 3   Non Refund        93       0.12          100.00

##   distribution_channel frequency percentage cumulative_perc
## 1                TA/TO     57718      76.79           76.79
## 2               Direct     12088      16.08           92.87
## 3            Corporate      5203       6.92           99.79
## 4                  GDS       156       0.21          100.00
## 5            Undefined         1       0.00          100.00

##          hotel frequency percentage cumulative_perc
## 1   City Hotel     46228       61.5            61.5
## 2 Resort Hotel     28938       38.5           100.0

##   market_segment frequency percentage cumulative_perc
## 1      Online TA     35738      47.55           47.55
## 2  Offline TA/TO     15908      21.16           68.71
## 3         Direct     10672      14.20           82.91
## 4         Groups      7714      10.26           93.17
## 5      Corporate      4303       5.72           98.89
## 6  Complementary       646       0.86           99.75
## 7       Aviation       185       0.25          100.00

##        meal frequency percentage cumulative_perc
## 1        BB     57800      76.90           76.90
## 2        HB      9479      12.61           89.51
## 3        SC      6684       8.89           98.40
## 4 Undefined       883       1.17           99.57
## 5        FB       320       0.43          100.00

##   reservation_status frequency percentage cumulative_perc
## 1          Check-Out     75166        100             100

##   reserved_room_type frequency percentage cumulative_perc
## 1                  A     52364      69.66           69.66
## 2                  D     13099      17.43           87.09
## 3                  E      4621       6.15           93.24
## 4                  F      2017       2.68           95.92
## 5                  G      1331       1.77           97.69
## 6                  B       750       1.00           98.69
## 7                  C       624       0.83           99.52
## 8                  H       356       0.47           99.99
## 9                  L         4       0.01          100.00
## [1] "Variables processed: arrival_date_month, assigned_room_type, customer_type, deposit_type, distribution_channel, hotel, market_segment, meal, reservation_status, reserved_room_type"

Observations:

  1. It makes sense that there are more bookings during the summer months, given vacation travel
  2. For all factors, consider collapsing/lumping levels that make up less than 2% of the dataset (see the sketch below)
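As a concrete illustration of observation 2, a hedged sketch using forcats::fct_lump_prop() (not applied in this analysis; the recipes-native equivalent would be step_other(all_nominal(), threshold = 0.02)):

#collapse factor levels with < 2% share into an 'Other' level, then recount levels
a %>%
  mutate(across(where(is.factor), ~ fct_lump_prop(.x, prop = 0.02))) %>%
  select(where(is.factor)) %>%
  map(n_unique)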

viz: mosaic plots: ggpairs

#documentation: https://mran.microsoft.com/snapshot/2016-01-12/web/packages/GGally/vignettes/ggpairs.html

# distribution_channel (nom.vars[8]) vs hotel (nom.vars[9])
a %>% select(all_of(nom.vars)) %>% 
  GGally::ggpairs(
    columns = c(nom.vars[8], nom.vars[9]),
    mapping = aes(color = .data[[nom.vars[9]]])
)

# market_segment (nom.vars[10]) vs hotel (nom.vars[9])
a %>% select(all_of(nom.vars)) %>% 
  GGally::ggpairs(
    columns = c(nom.vars[10], nom.vars[9]),
    mapping = aes(color = .data[[nom.vars[9]]])
)

Observations:

  1. For City Hotels, the ‘Online Travel Agent’ market segment makes up a much larger share of bookings than it does for Resort Hotels (quantified in the table below)
  2. Bookings from the ‘Aviation’ market segment are non-existent for Resort Hotels
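To put numbers behind observation 1, a quick composition table (a plain dplyr aggregation, not in the original):

#market segment share within each hotel type
a %>%
  count(hotel, market_segment) %>%
  group_by(hotel) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  arrange(hotel, desc(pct))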

EDA: num vars

names

(num.vars = a %>% select(where(is.numeric)) %>% colnames %>% as.character)
##  [1] "adr"                            "adults"                        
##  [3] "arrival_date_day_of_month"      "arrival_date_week_number"      
##  [5] "arrival_date_year"              "babies"                        
##  [7] "booking_changes"                "children"                      
##  [9] "days_in_waiting_list"           "is_repeated_guest"             
## [11] "lead_time"                      "previous_bookings_not_canceled"
## [13] "previous_cancellations"         "required_car_parking_spaces"   
## [15] "stays_in_week_nights"           "stays_in_weekend_nights"       
## [17] "total_of_special_requests"

check missing

a %>% select(all_of(num.vars)) %>% miss_var_summary

sample

a %>% select(all_of(num.vars)) %>% sample_n(5)

skim

a %>% select(all_of(num.vars)) %>% skim_without_charts()
Data summary:
  Name: Piped data
  Number of rows: 75166
  Number of columns: 17
  Column type frequency: numeric (17)
  Group variables: None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
adr 0 1 99.99 49.21 -6.38 67.5 92.5 125 510
adults 0 1 1.83 0.51 0.00 2.0 2.0 2 4
arrival_date_day_of_month 0 1 15.84 8.78 1.00 8.0 16.0 23 31
arrival_date_week_number 0 1 27.08 13.90 1.00 16.0 28.0 38 53
arrival_date_year 0 1 2016.15 0.70 2015.00 2016.0 2016.0 2017 2017
babies 0 1 0.01 0.11 0.00 0.0 0.0 0 10
booking_changes 0 1 0.29 0.74 0.00 0.0 0.0 0 21
children 0 1 0.10 0.39 0.00 0.0 0.0 0 3
days_in_waiting_list 0 1 1.59 14.78 0.00 0.0 0.0 0 379
is_repeated_guest 0 1 0.04 0.20 0.00 0.0 0.0 0 1
lead_time 0 1 79.98 91.11 0.00 9.0 45.0 124 737
previous_bookings_not_canceled 0 1 0.20 1.81 0.00 0.0 0.0 0 72
previous_cancellations 0 1 0.02 0.27 0.00 0.0 0.0 0 13
required_car_parking_spaces 0 1 0.10 0.30 0.00 0.0 0.0 0 8
stays_in_week_nights 0 1 2.46 1.92 0.00 1.0 2.0 3 50
stays_in_weekend_nights 0 1 0.93 0.99 0.00 0.0 1.0 2 19
total_of_special_requests 0 1 0.71 0.83 0.00 0.0 1.0 1 5

viz: distribution - hist

a %>% select(all_of(num.vars)) %>% DataExplorer::plot_histogram(
  #scale_x = 'log10',
  geom_histogram_args = list(bins = 50L),
  ncol = 2, nrow = 2
)

The peaks/troughs for ‘arrival_date_week_number’ likely correspond to seasonal demand:

a %>% plot_ly(x = ~arrival_date_week_number) %>% add_histogram()

viz: distribution - density

a %>% select(all_of(num.vars)) %>% DataExplorer::plot_density(
  scale_x = 'log10',
  #geom_histogram_args = list(bins = 50L),
  ncol = 2, nrow = 2
)

viz: outliers

a %>% select(all_of(num.vars)) %>% dlookr::plot_outlier()

viz: normality

a %>% select(all_of(num.vars)) %>% dlookr::plot_normality()
## Warning in log(x): NaNs produced
## Warning in sqrt(x): NaNs produced

viz: normality: outcome var only

a %>% select(adr) %>% dlookr::plot_normality()
## Warning in log(x): NaNs produced
## Warning in sqrt(x): NaNs produced

consider log transforming the outcome (adr)
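A quick before/after sketch of that transform (zero/negative adr rows excluded, mirroring the step_filter(adr > 0) applied later in the recipes):

#compare the raw vs log10 distributions of the outcome
a %>%
  filter(adr > 0) %>%
  transmute(adr, log10_adr = log10(adr)) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(value)) +
  geom_density() +
  facet_wrap(~name, scales = 'free')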

viz: correlations

a %>% select(all_of(num.vars)) %>% visdat::vis_cor()

#a %>% select(all_of(num.vars)) %>% dlookr::plot_correlate()

a %>% select(all_of(num.vars)) %>% GGally::ggcorr(low = 'red', high = 'darkgreen', label = TRUE)

Observations:

  1. positive correlation between ‘previous_bookings_not_canceled’ and ‘is_repeated_guest’, which is intuitive: repeat guests accumulate prior bookings (quantified below)
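A one-liner to quantify that relationship (Pearson's r on the raw columns; not in the original):

cor(a$previous_bookings_not_canceled, a$is_repeated_guest)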

EDA: nom/num vars

target.var = 'adr'

viz: cor w/target - pearson

a %>% select(all_of(num.vars)) %>% funModeling::correlation_table(target = target.var)

viz: cross plot - input/target distribution

### cross_plot is useful when the target var is binary; adr (the actual target) is numeric,
### so the binary 'hotel' var is used here as an illustrative target
nom.vars.eXc.many.levels = a %>% select(all_of(nom.vars)) %>% select(-company, -agent, -country) %>% colnames

a %>% funModeling::cross_plot(
  input = nom.vars.eXc.many.levels,
  target = 'hotel'
  )

a %>% funModeling::cross_plot(
  input = num.vars,
  target = 'hotel'
  )

quick analysis: binary target

#categ_analysis is used when we need to reduce variable cardinality in predictive modeling;
#it works in conjunction with 'cross_plot'
levels(a$hotel)

a %>% funModeling::categ_analysis(input = nom.vars[8], target = 'hotel')  #distribution_channel; mean_target is the share of 'Resort Hotel'
a %>% funModeling::categ_analysis(input = nom.vars[10], target = 'hotel') #market_segment; mean_target is the share of 'Resort Hotel'

Preprocess Data

1) split data
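The split objects (split, train, vfold) used below are not shown in this extract; a minimal rsample sketch, assuming a conventional 75/25 split and 5-fold CV (the seed, proportion, and fold count are assumptions):

library(rsample) #also attached by library(tidymodels) in the next chunk

set.seed(123) #hypothetical seed

split = initial_split(a, prop = 0.75, strata = adr) #stratify on the outcome
train = training(split)
test  = testing(split) #held out; last_fit(split) evaluates on it later

vfold = vfold_cv(train, v = 5)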

2) create recipe spec

#order reference: https://recipes.tidymodels.org/articles/Ordering.html

library(tidymodels)

rec.en = train %>% recipe(adr ~ . ) %>% 
  step_filter(adr > 0) %>% #drop the $0.00-rate rows flagged in EDA
  step_log(adr, base = 10, offset = 0.01) %>% 
  #step_rm() %>% #excessive nas
  #step_bagimpute(all_numeric(),-all_outcomes()) %>% 
  #step_knnimpute(all_nominal(),-all_outcomes()) %>%
  step_corr(all_numeric(),-all_outcomes()) %>% #drop highly correlated predictors
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_nzv(all_predictors(),-all_outcomes()) %>% #drop near-zero-variance predictors
  step_zv(all_predictors(),-all_outcomes()) %>% 
  step_normalize(all_numeric(),-all_outcomes()) #center/scale; glmnet needs this

rec.en %>% tidy
#----------------------------

rec.rf = train %>% recipe(adr ~ . ) %>% #same as rec.en, minus step_dummy
  step_filter(adr > 0) %>% 
  step_log(adr, base = 10, offset = 0.01) %>% 
  #step_rm() %>% #excessive nas
  #step_bagimpute(all_numeric(),-all_outcomes()) %>% 
  #step_knnimpute(all_nominal(),-all_outcomes()) %>%
  step_corr(all_numeric(),-all_outcomes()) %>%
  #step_dummy(all_nominal(), one_hot = TRUE) %>% #skipped: ranger handles factors natively (also avoids the 'There are new levels in a factor' error)
  step_nzv(all_predictors(),-all_outcomes()) %>%
  step_zv(all_predictors(),-all_outcomes()) %>% 
  step_normalize(all_numeric(),-all_outcomes())

rec.rf %>% tidy

3) preprocess

check missing
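No preprocessing code survives in this extract; a sketch of how the check could look with recipes' prep()/bake() (the naniar re-check is an assumption):

#materialize the elastic net recipe on the training set and re-check missingness
rec.en %>%
  prep() %>%
  bake(new_data = NULL) %>% #returns the processed training data
  miss_var_summary()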

4) create model spec

mdl.en = parsnip::linear_reg(
  penalty = tune(), #amount of regularization (lambda)
  mixture = tune()  #lasso/ridge mix (1 = pure lasso, 0 = pure ridge)
) %>% 
  set_mode('regression') %>% 
  set_engine('glmnet')

#----------------------------

mdl.rf = parsnip::rand_forest(
  trees = 150,
  min_n = tune(), #min number of observations at terminal node
  mtry = tune() #number of vars to randomly subset at each node
) %>% 
  set_mode('regression') %>% 
  set_engine('ranger', importance = 'impurity_corrected')

5) create workflow spec

wf.en = workflow() %>% 
  add_recipe(rec.en) %>% 
  add_model(mdl.en)

#----------------------------

wf.rf = workflow() %>% 
  add_recipe(rec.rf) %>% 
  add_model(mdl.rf)

6) execute workflow on vfold using auto hp tuning

tg.en = tune_grid(
  object = wf.en,
  resamples = vfold, 
  grid = 5 #5 candidate hp combinations; the grid is created automatically
)

tg.en %>% collect_metrics()
#----------------------------
doParallel::registerDoParallel() #use parallel processing
set.seed(345)

tg.rf = tune_grid(
  object = wf.rf,
  resamples = vfold, 
  grid = 5 #5 candidate hp combinations; the grid is created automatically
)

tg.rf %>% collect_metrics()

viz evaluation metrics

ggplotly(
tg.en %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, penalty, mixture) %>%
  pivot_longer(penalty:mixture,
    values_to = "value",
    names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE, size = 3) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(title = 'Elastic Net RMSE by Hyperparameter', x = NULL, y = '')
)
tg.en %>% show_best('rmse')
#----------------------------

ggplotly(
tg.rf %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(min_n:mtry,
    values_to = "value",
    names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE, size = 3) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(title = 'Random Forest RMSE by Hyperparameter', x = NULL, y = '')
)
tg.rf %>% show_best('rmse')

7) select best hps

(best.hps.en = tg.en %>% select_best('rmse'))
#----------------------------
(best.hps.rf = tg.rf %>% select_best('rmse'))

8) finalize workflow & fit model

wf.en.fin = wf.en %>% 
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(best.hps.en) %>%
  #2) fit on entire train
  fit(train)

#----------------------------

wf.rf.fin = wf.rf %>% 
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(best.hps.rf) %>%
  #2) fit on entire train
  fit(train)

9) check vip

wf.en.fin %>% pull_workflow_fit() %>% vip(geom = 'point') + labs(title = 'ENET var importance')

#wf.en.fin %>% pull_workflow_fit() %>% vi
wf.rf.fin %>% pull_workflow_fit() %>% vip(geom = 'point') + labs(title = 'RF var importance')

#wf.rf.fin %>% pull_workflow_fit() %>% vi

10) make preds on test, then evaluate performance

wf.en.fin %>%
  last_fit(split) %>%  #emulates the process where, after determining the best model, the final fit on the entire training set is needed and is then evaluated on the test set.
  collect_predictions %>% 
  mutate(.pred = 10 ^ .pred, adr = 10 ^ adr) %>% # 'undo' the log10 transform
  select(.pred, adr) %>%
  #metric_set(rmse, rsq, mae) creates a metric function; the piped data becomes its first argument
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)
#----------------------------

wf.rf.fin %>%
  last_fit(split) %>%  #emulates the process where, after determining the best model, the final fit on the entire training set is needed and is then evaluated on the test set.
  collect_predictions %>%
  mutate(.pred = 10 ^ .pred, adr = 10 ^ adr) %>% # 'undo' the log10 transform
  select(.pred, adr) %>%
  #metric_set(rmse, rsq, mae) creates a metric function; the piped data becomes its first argument
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)