Predict hotel booking average daily rate (adr) using 2 supervised ML algorithms, elastic net regression and random forest:
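The chunks below assume several packages are already attached; the setup chunk is not shown, so the following is a minimal sketch inferred from the functions that get called:

library(tidyverse)    #read_csv, dplyr, purrr, tidyr, forcats, ggplot2
library(lubridate)    #is.Date()
library(janitor)      #clean_names()
library(skimr)        #skim(), n_unique()
library(plotly)       #plot_ly(), ggplotly(), hide_legend()
library(GGally)       #ggpairs()
library(DataExplorer) #plot_histogram()
library(funModeling)  #freq(), cross_plot(), categ_analysis()
library(vip)          #vip() variable importance plots
#tidymodels (rsample, recipes, parsnip, tune, workflows, yardstick) is attached later, in the modeling section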
a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>%
  mutate(across(where(is.character), factor)) %>% #characters -> factors
  select(sort(tidyselect::peek_vars())) %>%       #order columns alphabetically
  select(
    where(is.Date),
    where(is.character),
    where(is.factor),
    where(is.numeric)
  ) %>%
  filter(is_canceled == 0) #keep only non-canceled bookings
a$is_canceled = NULL
a$reservation_status_date = NULL
a %>% sample_n(10)

Observations:
## [1] "agent" "arrival_date_month" "assigned_room_type"
## [4] "company" "country" "customer_type"
## [7] "deposit_type" "distribution_channel" "hotel"
## [10] "market_segment" "meal" "reservation_status"
## [13] "reserved_room_type"
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(nom.vars)` instead of `nom.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
| Name | Piped data |
|---|---|
| Number of rows | 75166 |
| Number of columns | 13 |
| Column type frequency: factor | 13 |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| agent | 0 | 1 | FALSE | 315 | 9: 18697, NUL: 12310, 240: 8438, 7: 3065 |
| arrival_date_month | 0 | 1 | FALSE | 12 | Aug: 8638, Jul: 7919, May: 7114, Oct: 6914 |
| assigned_room_type | 0 | 1 | FALSE | 10 | A: 41105, D: 18960, E: 5838, F: 2824 |
| company | 0 | 1 | FALSE | 332 | NUL: 69560, 40: 850, 223: 665, 45: 222 |
| country | 0 | 1 | FALSE | 166 | PRT: 21071, GBR: 9676, FRA: 8481, ESP: 6391 |
| customer_type | 0 | 1 | FALSE | 4 | Tra: 53099, Tra: 18735, Con: 2814, Gro: 518 |
| deposit_type | 0 | 1 | FALSE | 3 | No : 74947, Ref: 126, Non: 93 |
| distribution_channel | 0 | 1 | FALSE | 5 | TA/: 57718, Dir: 12088, Cor: 5203, GDS: 156 |
| hotel | 0 | 1 | FALSE | 2 | Cit: 46228, Res: 28938 |
| market_segment | 0 | 1 | FALSE | 7 | Onl: 35738, Off: 15908, Dir: 10672, Gro: 7714 |
| meal | 0 | 1 | FALSE | 5 | BB: 57800, HB: 9479, SC: 6684, Und: 883 |
| reservation_status | 0 | 1 | FALSE | 1 | Che: 75166, Can: 0, No-: 0 |
| reserved_room_type | 0 | 1 | FALSE | 9 | A: 52364, D: 13099, E: 4621, F: 2017 |
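One thing the table makes clear: after keeping only non-canceled bookings, reservation_status has a single observed level (every row is Check-Out), so it carries no predictive information; the zero-variance filters in the recipes further down would drop it anyway. A quick confirmation:

a %>% count(reservation_status, .drop = FALSE) #all 75166 rows are 'Check-Out'; the other levels are empty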
a %>% select(all_of(nom.vars)) %>%
  map(n_unique) %>%
  as.data.frame.list %>%
  pivot_longer(everything()) %>%
  mutate(name = fct_reorder(name, value)) %>%
  plot_ly(x = ~value, y = ~name, color = ~name) %>%
  hide_legend() %>%
  layout(title = 'factor vars level counts', xaxis = list(title = 'count'), yaxis = list(title = ''))

## arrival_date_month frequency percentage cumulative_perc
## 1 August 8638 11.49 11.49
## 2 July 7919 10.54 22.03
## 3 May 7114 9.46 31.49
## 4 October 6914 9.20 40.69
## 5 March 6645 8.84 49.53
## 6 April 6565 8.73 58.26
## 7 June 6404 8.52 66.78
## 8 September 6392 8.50 75.28
## 9 February 5372 7.15 82.43
## 10 November 4672 6.22 88.65
## 11 December 4409 5.87 94.52
## 12 January 4122 5.48 100.00
## assigned_room_type frequency percentage cumulative_perc
## 1 A 41105 54.69 54.69
## 2 D 18960 25.22 79.91
## 3 E 5838 7.77 87.68
## 4 F 2824 3.76 91.44
## 5 C 1929 2.57 94.01
## 6 G 1773 2.36 96.37
## 7 B 1651 2.20 98.57
## 8 H 461 0.61 99.18
## 9 I 358 0.48 99.66
## 10 K 267 0.36 100.00
## customer_type frequency percentage cumulative_perc
## 1 Transient 53099 70.64 70.64
## 2 Transient-Party 18735 24.92 95.56
## 3 Contract 2814 3.74 99.30
## 4 Group 518 0.69 100.00
## deposit_type frequency percentage cumulative_perc
## 1 No Deposit 74947 99.71 99.71
## 2 Refundable 126 0.17 99.88
## 3 Non Refund 93 0.12 100.00
## distribution_channel frequency percentage cumulative_perc
## 1 TA/TO 57718 76.79 76.79
## 2 Direct 12088 16.08 92.87
## 3 Corporate 5203 6.92 99.79
## 4 GDS 156 0.21 100.00
## 5 Undefined 1 0.00 100.00
## hotel frequency percentage cumulative_perc
## 1 City Hotel 46228 61.5 61.5
## 2 Resort Hotel 28938 38.5 100.0
## market_segment frequency percentage cumulative_perc
## 1 Online TA 35738 47.55 47.55
## 2 Offline TA/TO 15908 21.16 68.71
## 3 Direct 10672 14.20 82.91
## 4 Groups 7714 10.26 93.17
## 5 Corporate 4303 5.72 98.89
## 6 Complementary 646 0.86 99.75
## 7 Aviation 185 0.25 100.00
## meal frequency percentage cumulative_perc
## 1 BB 57800 76.90 76.90
## 2 HB 9479 12.61 89.51
## 3 SC 6684 8.89 98.40
## 4 Undefined 883 1.17 99.57
## 5 FB 320 0.43 100.00
## reservation_status frequency percentage cumulative_perc
## 1 Check-Out 75166 100 100
## reserved_room_type frequency percentage cumulative_perc
## 1 A 52364 69.66 69.66
## 2 D 13099 17.43 87.09
## 3 E 4621 6.15 93.24
## 4 F 2017 2.68 95.92
## 5 G 1331 1.77 97.69
## 6 B 750 1.00 98.69
## 7 C 624 0.83 99.52
## 8 H 356 0.47 99.99
## 9 L 4 0.01 100.00
## [1] "Variables processed: arrival_date_month, assigned_room_type, customer_type, deposit_type, distribution_channel, hotel, market_segment, meal, reservation_status, reserved_room_type"
Observations:
#documentation: https://mran.microsoft.com/snapshot/2016-01-12/web/packages/GGally/vignettes/ggpairs.html
a %>% select(all_of(nom.vars)) %>%
  GGally::ggpairs(
    columns = c(nom.vars[8], nom.vars[9]), #distribution_channel vs hotel
    mapping = aes(color = eval(as.name(nom.vars[9])))
  )

#documentation: https://mran.microsoft.com/snapshot/2016-01-12/web/packages/GGally/vignettes/ggpairs.html
a %>% select(all_of(nom.vars)) %>%
  GGally::ggpairs(
    columns = c(nom.vars[10], nom.vars[9]), #market_segment vs hotel
    mapping = aes(color = eval(as.name(nom.vars[9])))
  )

Observations:
## [1] "adr" "adults"
## [3] "arrival_date_day_of_month" "arrival_date_week_number"
## [5] "arrival_date_year" "babies"
## [7] "booking_changes" "children"
## [9] "days_in_waiting_list" "is_repeated_guest"
## [11] "lead_time" "previous_bookings_not_canceled"
## [13] "previous_cancellations" "required_car_parking_spaces"
## [15] "stays_in_week_nights" "stays_in_weekend_nights"
## [17] "total_of_special_requests"
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(num.vars)` instead of `num.vars` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
| Name | Piped data |
|---|---|
| Number of rows | 75166 |
| Number of columns | 17 |
| Column type frequency: numeric | 17 |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| adr | 0 | 1 | 99.99 | 49.21 | -6.38 | 67.5 | 92.5 | 125 | 510 |
| adults | 0 | 1 | 1.83 | 0.51 | 0.00 | 2.0 | 2.0 | 2 | 4 |
| arrival_date_day_of_month | 0 | 1 | 15.84 | 8.78 | 1.00 | 8.0 | 16.0 | 23 | 31 |
| arrival_date_week_number | 0 | 1 | 27.08 | 13.90 | 1.00 | 16.0 | 28.0 | 38 | 53 |
| arrival_date_year | 0 | 1 | 2016.15 | 0.70 | 2015.00 | 2016.0 | 2016.0 | 2017 | 2017 |
| babies | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.0 | 0.0 | 0 | 10 |
| booking_changes | 0 | 1 | 0.29 | 0.74 | 0.00 | 0.0 | 0.0 | 0 | 21 |
| children | 0 | 1 | 0.10 | 0.39 | 0.00 | 0.0 | 0.0 | 0 | 3 |
| days_in_waiting_list | 0 | 1 | 1.59 | 14.78 | 0.00 | 0.0 | 0.0 | 0 | 379 |
| is_repeated_guest | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.0 | 0.0 | 0 | 1 |
| lead_time | 0 | 1 | 79.98 | 91.11 | 0.00 | 9.0 | 45.0 | 124 | 737 |
| previous_bookings_not_canceled | 0 | 1 | 0.20 | 1.81 | 0.00 | 0.0 | 0.0 | 0 | 72 |
| previous_cancellations | 0 | 1 | 0.02 | 0.27 | 0.00 | 0.0 | 0.0 | 0 | 13 |
| required_car_parking_spaces | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.0 | 0.0 | 0 | 8 |
| stays_in_week_nights | 0 | 1 | 2.46 | 1.92 | 0.00 | 1.0 | 2.0 | 3 | 50 |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 0.99 | 0.00 | 0.0 | 1.0 | 2 | 19 |
| total_of_special_requests | 0 | 1 | 0.71 | 0.83 | 0.00 | 0.0 | 1.0 | 1 | 5 |
a %>% select(all_of(num.vars)) %>% DataExplorer::plot_histogram(
  #scale_x = 'log10',
  geom_histogram_args = list(bins = 50L),
  ncol = 2, nrow = 2
)

## Warning in log(x): NaNs produced
## Warning in sqrt(x): NaNs produced
## Warning in log(x): NaNs produced
## Warning in sqrt(x): NaNs produced
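Those NaN warnings most likely trace back to adr, the only numeric column with negative values (p0 = -6.38 in the skim table): log and sqrt are undefined for negatives. A quick check, and a reminder of why the recipes below start with step_filter(adr > 0):

a %>% filter(adr < 0) %>% nrow() #bookings with a negative average daily rate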
### useful when the target var is binary
nom.vars.eXc.many.levels = a %>% select(all_of(nom.vars)) %>% select(-company, -agent, -country) %>% colnames
a %>% funModeling::cross_plot(
  input = nom.vars.eXc.many.levels,
  target = 'hotel'
)

#This function is used to analyze data when we need to reduce variable cardinality in predictive modeling.
#works in conjunction with 'cross_plot';
levels(a$hotel)
a %>% funModeling::categ_analysis(input = nom.vars[8], target = 'hotel')  #mean_target, in reference to %Resort Hotel
a %>% funModeling::categ_analysis(input = nom.vars[10], target = 'hotel') #mean_target, in reference to %Resort Hotel

#order reference: https://recipes.tidymodels.org/articles/Ordering.html
library(tidymodels)
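#the recipes below are built on a 'train' set, and the tuning and final evaluation further down need
#a 'split' object and cross-validation folds ('vfold'); none of these are created in the chunks shown,
#so the lines that follow are an assumed sketch of that scaffolding (seed and proportion are hypothetical)
set.seed(123)
split = initial_split(a, prop = 0.75)
train = training(split)
vfold = vfold_cv(train, v = 5)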
rec.en = train %>% recipe(adr ~ . ) %>%
  step_filter(adr > 0) %>%
  step_log(adr, base = 10, offset = 0.01) %>%
  #step_rm() %>% #excessive nas
  #step_bagimpute(all_numeric(), -all_outcomes()) %>%
  #step_knnimpute(all_nominal(), -all_outcomes()) %>%
  step_corr(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_nzv(all_predictors(), -all_outcomes()) %>%
  step_zv(all_predictors(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes())
rec.en %>% tidy
#----------------------------
rec.rf = train %>% recipe(adr ~ . ) %>%
  step_filter(adr > 0) %>%
  step_log(adr, base = 10, offset = 0.01) %>%
  #step_rm() %>% #excessive nas
  #step_bagimpute(all_numeric(), -all_outcomes()) %>%
  #step_knnimpute(all_nominal(), -all_outcomes()) %>%
  step_corr(all_numeric(), -all_outcomes()) %>%
  #step_dummy(all_nominal(), one_hot = TRUE) %>% #commented out: caused 'There are new levels in a factor' errors, and ranger handles factor predictors directly
  step_nzv(all_predictors(), -all_outcomes()) %>%
  step_zv(all_predictors(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes())
rec.rf %>% tidy
#----------------------------
mdl.en = parsnip::linear_reg(
  penalty = tune(), #amount of regularization
  mixture = tune()  #lasso/ridge mix
) %>%
  set_mode('regression') %>%
  set_engine('glmnet')
#----------------------------
mdl.rf = parsnip::rand_forest(
  trees = 150,
  min_n = tune(), #min number of observations at a terminal node
  mtry = tune()   #number of vars to randomly subset at each node
) %>%
  set_mode('regression') %>%
  set_engine('ranger', importance = 'impurity_corrected')

tg.en = tune_grid(
  object = wf.en,
  resamples = vfold,
  grid = 5 #create a tuning grid automatically
)
tg.en %>% collect_metrics()
#----------------------------
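Both tune_grid() calls (the elastic net above, the random forest below) take workflow objects, wf.en and wf.rf, that are not created in the chunks shown. Presumably each one simply bundles a recipe with its matching model spec; a minimal sketch under that assumption:

wf.en = workflow() %>% add_recipe(rec.en) %>% add_model(mdl.en) #assumed
wf.rf = workflow() %>% add_recipe(rec.rf) %>% add_model(mdl.rf) #assumed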
doParallel::registerDoParallel() #use parallel processing
set.seed(345)
tg.rf = tune_grid(
  object = wf.rf,
  resamples = vfold,
  grid = 5 #create a tuning grid automatically
)
tg.rf %>% collect_metrics()

ggplotly(
  tg.en %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    select(mean, penalty, mixture) %>%
    pivot_longer(penalty:mixture,
                 values_to = "value",
                 names_to = "parameter") %>%
    ggplot(aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(title = 'Elastic Net RMSE by Hyperparameter', x = NULL, y = '')
)
#----------------------------
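Further down, finalize_workflow() is given best.hps.en and best.hps.rf, which are also not created in the chunks shown; presumably they are the best RMSE combinations pulled from the tuning results, e.g.:

best.hps.en = tg.en %>% select_best(metric = 'rmse') #assumed
best.hps.rf = tg.rf %>% select_best(metric = 'rmse') #assumed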
ggplotly(
  tg.rf %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    select(mean, min_n, mtry) %>%
    pivot_longer(min_n:mtry,
                 values_to = "value",
                 names_to = "parameter") %>%
    ggplot(aes(value, mean, color = parameter)) +
    geom_point(show.legend = FALSE, size = 3) +
    facet_wrap(~parameter, scales = "free_x") +
    labs(title = 'Random Forest RMSE by Hyperparameter', x = NULL, y = '')
)

wf.en.fin = wf.en %>%
#1) finalize wf (recipe, model w/previously unknown hps) using best hps
finalize_workflow(best.hps.en) %>%
#2) fit on entire train
fit(train)
#----------------------------
wf.rf.fin = wf.rf %>%
  #1) finalize wf (recipe, model w/previously unknown hps) using best hps
  finalize_workflow(best.hps.rf) %>%
  #2) fit on entire train
  fit(train)

#wf.en.fin %>% pull_workflow_fit() %>% vi
wf.rf.fin %>% pull_workflow_fit() %>% vip(geom = 'point') + labs(title = 'RF var importance')

wf.en.fin %>%
  last_fit(split) %>% #fits the finalized workflow on the full training set and evaluates it on the held-out test set
  collect_predictions() %>%
  mutate(.pred = 10 ^ .pred, adr = 10 ^ adr) %>% #'undo' the log10 transform (ignoring the 0.01 offset)
  select(.pred, adr) %>%
  #metric_set(rmse, rsq, mae) builds an in-line metric function that is applied immediately
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)
#----------------------------
wf.rf.fin %>%
  last_fit(split) %>% #fits the finalized workflow on the full training set and evaluates it on the held-out test set
  collect_predictions() %>%
  mutate(.pred = 10 ^ .pred, adr = 10 ^ adr) %>% #'undo' the log10 transform (ignoring the 0.01 offset)
  select(.pred, adr) %>%
  #metric_set(rmse, rsq, mae) builds an in-line metric function that is applied immediately
  metric_set(rmse, rsq, mae)(truth = adr, estimate = .pred)