——————————————————————————

Feature Engineering and Selection: A Practical Approach for Predictive Models

by Max Kuhn and Kjell Johnson

——————————————————————————

Code for Section 6.1 at

https://bookdown.org/max/FES/numeric-one-to-one.html

——————————————————————————

Code requires these packages:

# 
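# caret supplies the segmentationData set (and attaches ggplot2);
# tidymodels attaches recipes, dplyr, and the other modeling packages.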

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.10     ✔ rsample      1.3.1 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ dplyr        1.1.4      ✔ tidyr        1.3.1 
## ✔ infer        1.0.9      ✔ tune         2.0.0 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.3.3      ✔ workflowsets 1.1.1 
## ✔ purrr        1.1.0      ✔ yardstick    1.3.2 
## ✔ recipes      1.3.1
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tailor' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ rsample::calibration()   masks caret::calibration()
## ✖ purrr::discard()         masks scales::discard()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ purrr::lift()            masks caret::lift()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
# Use a black-and-white ggplot2 theme with the legend on top for all plots.
theme_set(theme_bw() + theme(legend.position = "top"))

# Load segmentationData, keep two predictors plus the outcome (`Class`) and
# the train/test indicator (`Case`), and give the predictors generic names.
# Selecting columns by name also discards `Cell` and every other column.
data("segmentationData")
segmentationData <- setNames(
  segmentationData[, c("EqSphereAreaCh1", "PerimCh1", "Class", "Case")],
  c("PredictorA", "PredictorB", "Class", "Case")
)

# Split on the `Case` indicator and drop that column from each piece.
# `which()` mirrors `subset()` by never selecting rows where the test is NA.
example_train <- segmentationData[
  which(segmentationData$Case == "Train"),
  names(segmentationData) != "Case"
]
example_test <- segmentationData[
  which(segmentationData$Case == "Test"),
  names(segmentationData) != "Case"
]

# Define a recipe that applies a Box-Cox transformation to both predictors
# and estimate it on the training set.
simple_trans_rec <-
  recipe(Class ~ ., data = example_train) |>
  step_BoxCox(PredictorA, PredictorB) |>
  prep(training = example_train)

# Apply the trained recipe to the test set.
simple_trans_test <- bake(simple_trans_rec, new_data = example_test)

# Pull the `value` column of the first step's tidy() summary for PredictorB
# (kept as a tibble rather than a bare number).
pred_b_lambda <-
  simple_trans_rec |>
  tidy(number = 1) |>
  filter(terms == "PredictorB") |>
  select(value)

# Panel (a): histogram of Predictor B in the untransformed test set.
bc_before <-
  ggplot(data = example_test, mapping = aes(x = PredictorB)) +
  geom_histogram(bins = 35, colour = "blue", fill = "blue", alpha = 0.6) +
  labs(x = "Predictor B", title = "(a)")

# Panel (b): the same histogram after the recipe has been baked onto the
# test set.
bc_after <-
  ggplot(data = simple_trans_test, mapping = aes(x = PredictorB)) +
  geom_histogram(bins = 35, colour = "red", fill = "red", alpha = 0.6) +
  labs(x = "Predictor B (inverse)", title = "(b)")
# Print the R version, platform, and package versions for this session.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_Indonesia.utf8  LC_CTYPE=English_Indonesia.utf8   
## [3] LC_MONETARY=English_Indonesia.utf8 LC_NUMERIC=C                      
## [5] LC_TIME=English_Indonesia.utf8    
## 
## time zone: Asia/Jakarta
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] yardstick_1.3.2    workflowsets_1.1.1 workflows_1.3.0    tune_2.0.0        
##  [5] tidyr_1.3.1        tailor_0.1.0       rsample_1.3.1      recipes_1.3.1     
##  [9] purrr_1.1.0        parsnip_1.3.3      modeldata_1.5.1    infer_1.0.9       
## [13] dplyr_1.1.4        dials_1.4.2        scales_1.4.0       broom_1.0.10      
## [17] tidymodels_1.4.1   caret_7.0-1        lattice_0.22-6     ggplot2_4.0.0     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.23         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.6-4       magrittr_2.0.3      
## [13] compiler_4.4.1       rlang_1.1.6          sass_0.4.9          
## [16] tools_4.4.1          yaml_2.3.10          data.table_1.16.2   
## [19] knitr_1.49           plyr_1.8.9           DiceDesign_1.10     
## [22] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-19         
## [25] grid_4.4.1           stats4_4.4.1         future_1.67.0       
## [28] globals_0.18.0       iterators_1.0.14     MASS_7.3-60.2       
## [31] cli_3.6.5            rmarkdown_2.29       generics_0.1.3      
## [34] rstudioapi_0.17.1    future.apply_1.20.0  reshape2_1.4.4      
## [37] cachem_1.1.0         stringr_1.5.1        splines_4.4.1       
## [40] parallel_4.4.1       vctrs_0.6.5          hardhat_1.4.2       
## [43] Matrix_1.7-0         jsonlite_1.8.9       listenv_0.9.1       
## [46] foreach_1.5.2        gower_1.0.2          jquerylib_0.1.4     
## [49] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [52] lubridate_1.9.4      stringi_1.8.4        gtable_0.3.6        
## [55] GPfit_1.0-9          tibble_3.3.0         pillar_1.11.0       
## [58] furrr_0.3.1          htmltools_0.5.8.1    ipred_0.9-15        
## [61] lava_1.8.1           R6_2.5.1             lhs_1.2.0           
## [64] evaluate_1.0.1       backports_1.5.0      bslib_0.8.0         
## [67] class_7.3-22         Rcpp_1.0.13-1        nlme_3.1-164        
## [70] prodlim_2025.04.28   xfun_0.49            ModelMetrics_1.2.2.2
## [73] pkgconfig_2.0.3