——————————————————————————

Feature Engineering and Selection: A Practical Approach for Predictive Models

by Max Kuhn and Kjell Johnson

——————————————————————————

Code for Section 4.4 at

https://bookdown.org/max/FES/post-modeling.html

——————————————————————————

Code requires these packages:

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.10     ✔ recipes      1.3.1 
## ✔ dials        1.4.2      ✔ rsample      1.3.1 
## ✔ dplyr        1.1.4      ✔ tailor       0.1.0 
## ✔ ggplot2      4.0.0      ✔ tidyr        1.3.1 
## ✔ infer        1.0.9      ✔ tune         2.0.0 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.3.3      ✔ workflowsets 1.1.1 
## ✔ purrr        1.1.0      ✔ yardstick    1.3.2
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tailor' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.2
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:rsample':
## 
##     calibration
## The following object is masked from 'package:purrr':
## 
##     lift
theme_set(theme_bw())
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/chicago.RData")
load("C:/Users/Lenovo/Downloads/pRADYTHA/MACHINE LEARNING/TUGAS 1/stations.RData")
holidays <- 
  c("USNewYearsDay", "Jan02_Mon_Fri", "USMLKingsBirthday", 
    "USPresidentsDay", "USMemorialDay", "USIndependenceDay", 
    "Jul03_Mon_Fri", "Jul05_Mon_Fri", "USLaborDay", "USThanksgivingDay", 
    "Day_after_Thx", "ChristmasEve", "USChristmasDay", "Dec26_wkday", 
    "Dec31_Mon_Fri")

common_holiday <- 
  apply(training %>% dplyr::select(one_of(holidays)), 1, 
        function(x) ifelse(any(x == 1), 1, 0))

training <-
  training %>% 
  mutate(
    holiday  = common_holiday, 
    weekday = ifelse(dow %in% c("Sat", "Sun"), 0, 1)
  )
get_resid <- function(terms, next_term, return_mod = FALSE) {
  ctrl$verboseIter <- FALSE
  ctrl$predictionBounds <- c(0, NA)
  
  set.seed(4194)
  mod <- train(s_40380 ~ .,
               data = training[, c("s_40380", terms)],
               method = "lm",
               metric = "RMSE",
               maximize = FALSE,
               trControl = ctrl)
  
  x_mod <- train(as.formula(paste(next_term,"~ .")),
                 data = training[, c(terms, next_term)],
                 method = "lm",
                 metric = "RMSE",
                 maximize = FALSE,
                 trControl = ctrl)
  
  if(!return_mod) {
    out <- mod$pred
    out$Resample <- ymd(out$Resample)
    out$Date <- train_days[out$rowIndex]
    out$Month <- training$month[out$rowIndex]
    out$holiday <- training$holiday[out$rowIndex]
    out$weekday <- training$weekday[out$rowIndex]
    out$Residual <- out$obs - out$pred
    out$xResidual <- x_mod$pred$obs - x_mod$pred$pred
  } else out <- mod
  out
}
# There will be a warning about the "outcome only has two possible values". 
# This can be ignored.
base_resid <-
  get_resid(terms = c("year", "month", "week"),  next_term = "weekday") %>%
  mutate(Model = "Base Model")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
pow_resid <-
  get_resid(terms = c("year", "month", "week", "weekday"), next_term = "l14_40380") %>%
  mutate(Model = "Base + Part of Week")

l14_resid <-
  get_resid(
    terms = c("year", "month", "week", "weekday", "l14_40380"),
    next_term = "holiday"
  ) %>%
  mutate(Model = "Base + Part of Week + 14-Day Lag")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
hol_resid <- 
  get_resid(terms = c("year", "month", "week", "weekday", "l14_40380", "holiday"), 
            next_term = "l14_40370") %>%
  mutate(Model = "Base + Part of Week + 14-Day Lag + Holiday")

mod_lev <- c("Base Model", "Base + Part of Week", 
             "Base + Part of Week + 14-Day Lag",
             "Base + Part of Week + 14-Day Lag + Holiday")

model_resid <- 
  bind_rows(base_resid, pow_resid, l14_resid, hol_resid) %>%
  mutate(
    Model = factor(Model, levels = mod_lev),
    holiday = ifelse(holiday == 1, "yes", "no"),
    weekday = ifelse(weekday == 0, "Weekend", "Weekday")
  )

resid_hists <- 
  ggplot(model_resid, aes(x = Residual)) + 
  geom_vline(xintercept = 0, lty = 2, col = "darkgreen") + 
  geom_histogram(binwidth = 0.5, col = rgb(1, 1, 1, 0), fill = "blue", alpha = .5) + 
  facet_wrap(~Model, ncol = 1) + 
  xlab("Resampling Residual") + 
  ylab("Count") +
  ggtitle("(a)")

day_resid <-
  base_resid %>% 
  mutate(weekday = ifelse(weekday == 0, "Weekend", "Weekday")) %>% 
  ggplot(aes(x = xResidual, y = Residual)) + 
  geom_smooth(se = FALSE, method = lm, col = "black") +
  geom_point(aes(col = weekday, shape = weekday), alpha = .5) + 
  xlab("POW Model Residuals") +
  ylab("Base Model \n Residuals \n") +
  theme(
    legend.position = c(.2, .8), 
    legend.background = element_blank(), 
    legend.title = element_blank()
  ) +
  ggtitle("(b)")

l14_PR_resid <- 
  ggplot(pow_resid, aes(x = xResidual, y = Residual)) + 
  geom_point(alpha = .5) + 
  xlab("14-day Lag Model Residuals") +
  ylab("Base + POW Model \n Residuals \n") +
  ggtitle("(c)")

hol_PR_resid <- 
  l14_resid %>% 
  mutate(holiday = ifelse(holiday == 1, "yes", "no")) %>% 
  ggplot(aes(x = xResidual, y = Residual)) + 
  geom_smooth(se = FALSE, method = lm, col = "black") +
  geom_point(aes(col = holiday, shape = holiday), alpha = .5) + 
  xlab("Holiday Model Residuals") +
  ylab("Base + POW + \n 14-day Lag Model \n Residuals") +
  theme(
    legend.position = c(.2, .25), 
    legend.background = element_blank(), 
    legend.title = element_blank()
  ) +
  ggtitle("(d)")

# https://stackoverflow.com/questions/8112208/how-can-i-obtain-an-unbalanced-grid-of-ggplots
# grid.arrange(resid_hists, day_resid, l14_PR_resid, hol_PR_resid,
#              layout_matrix = cbind(c(1, 1, 1), c(2, 3, 4)))
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_Indonesia.utf8  LC_CTYPE=English_Indonesia.utf8   
## [3] LC_MONETARY=English_Indonesia.utf8 LC_NUMERIC=C                      
## [5] LC_TIME=English_Indonesia.utf8    
## 
## time zone: Asia/Jakarta
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] caret_7.0-1        lattice_0.22-6     gridExtra_2.3      lubridate_1.9.4   
##  [5] yardstick_1.3.2    workflowsets_1.1.1 workflows_1.3.0    tune_2.0.0        
##  [9] tidyr_1.3.1        tailor_0.1.0       rsample_1.3.1      recipes_1.3.1     
## [13] purrr_1.1.0        parsnip_1.3.3      modeldata_1.5.1    infer_1.0.9       
## [17] ggplot2_4.0.0      dplyr_1.1.4        dials_1.4.2        scales_1.4.0      
## [21] broom_1.0.10       tidymodels_1.4.1  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.23         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.6-4       magrittr_2.0.3      
## [13] compiler_4.4.1       rlang_1.1.6          sass_0.4.9          
## [16] tools_4.4.1          yaml_2.3.10          data.table_1.16.2   
## [19] knitr_1.49           plyr_1.8.9           DiceDesign_1.10     
## [22] RColorBrewer_1.1-3   withr_3.0.2          stats4_4.4.1        
## [25] nnet_7.3-19          grid_4.4.1           future_1.67.0       
## [28] iterators_1.0.14     globals_0.18.0       MASS_7.3-60.2       
## [31] cli_3.6.5            rmarkdown_2.29       generics_0.1.3      
## [34] rstudioapi_0.17.1    future.apply_1.20.0  reshape2_1.4.4      
## [37] cachem_1.1.0         stringr_1.5.1        splines_4.4.1       
## [40] parallel_4.4.1       vctrs_0.6.5          hardhat_1.4.2       
## [43] Matrix_1.7-0         jsonlite_1.8.9       listenv_0.9.1       
## [46] foreach_1.5.2        gower_1.0.2          jquerylib_0.1.4     
## [49] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [52] stringi_1.8.4        gtable_0.3.6         GPfit_1.0-9         
## [55] tibble_3.3.0         pillar_1.11.0        furrr_0.3.1         
## [58] htmltools_0.5.8.1    ipred_0.9-15         lava_1.8.1          
## [61] R6_2.5.1             lhs_1.2.0            evaluate_1.0.1      
## [64] backports_1.5.0      bslib_0.8.0          class_7.3-22        
## [67] Rcpp_1.0.13-1        nlme_3.1-164         prodlim_2025.04.28  
## [70] xfun_0.49            ModelMetrics_1.2.2.2 pkgconfig_2.0.3