Reference: Boehmke, B. & Greenwell, B. (2020). Hands-On Machine Learning with R. Chapter 3 - https://bradleyboehmke.github.io/HOML/engineering.html

Additional references:

James, Witten, Hastie & Tibshirani: An Introduction to Statistical Learning - https://www.statlearning.com

ISLR tidymodels labs - https://emilhvitfeldt.github.io/ISLR-tidymodels-labs/

Introduction to Statistical Learning Using R Book Club - https://r4ds.github.io/bookclub-islr/

Step by Step Data Science (tidymodels) - https://www.stepbystepdatascience.com/ml-with-tidymodels

The Data Science Learning Community - https://r4ds.github.io/bookclub-islp/

1 Prerequisites

1.1 Load packages

suppressPackageStartupMessages({
  library(tidyverse)
  library(tidymodels)
  library(AmesHousing)
  library(modeldata)
  library(visdat)
  library(naniar)
  library(moments)
  library(caret)
})

1.2 Load data

ames     <- AmesHousing::make_ames()
ames_raw <- AmesHousing::ames_raw

data("attrition", package = "modeldata")

cat("Ames dimensions      :", dim(ames),      "\n")

## Ames dimensions      : 2930 81

cat("Ames raw dimensions  :", dim(ames_raw),  "\n")

## Ames raw dimensions  : 2930 82

cat("Attrition dimensions :", dim(attrition), "\n")

## Attrition dimensions : 1470 31

1.3 Train / test split

set.seed(123)
ames_split <- initial_split(ames, prop = 0.7, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test  <- testing(ames_split)
cat("Training rows:", nrow(ames_train), "| Test rows:", nrow(ames_test), "\n")

## Training rows: 2049 | Test rows: 881

2 Target Engineering

2.1 Figure 3.1 - Log vs non-log model residuals

m_log <- lm(log(Sale_Price) ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_train)
m_raw <- lm(Sale_Price      ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_train)

bind_rows(
  tibble(Residuals = residuals(m_log), type = "Log transformed model residuals"),
  tibble(Residuals = residuals(m_raw), type = "Non-log transformed model residuals")
) %>%
  ggplot(aes(Residuals)) +
  geom_histogram(bins = 80, fill = "grey25", colour = "white", linewidth = 0.1) +
  facet_wrap(~type, scales = "free_x") +
  labs(x = "Residuals", y = NULL) +
  theme_bw(base_size = 12) +
  theme(
    strip.background = element_rect(fill = "grey85", colour = "grey70"),
    strip.text       = element_text(size = 10)
  )

Figure 3.1: Transforming the response variable to minimize skewness can resolve concerns with non-normally distributed errors.

2.2 Figure 3.2 - Response variable transformations

bc_prep <- prep(
  recipe(Sale_Price ~ ., data = ames_train) %>% step_BoxCox(Sale_Price),
  training = ames_train
)
bc_vals <- bake(bc_prep, new_data = ames_train)$Sale_Price

bind_rows(
  tibble(Value = ames_train$Sale_Price,       Transform = "Normal"),
  tibble(Value = log(ames_train$Sale_Price),  Transform = "Log_Transform"),
  tibble(Value = bc_vals,                     Transform = "BoxCox_Transform")
) %>%
  mutate(Transform = factor(Transform,
    levels = c("Normal", "Log_Transform", "BoxCox_Transform"))) %>%
  ggplot(aes(Value, fill = Transform)) +
  geom_histogram(bins = 50, colour = "white", linewidth = 0.1) +
  facet_wrap(~Transform, scales = "free") +
  scale_fill_manual(values = c(
    Normal           = "#E4645B",
    Log_Transform    = "#3A9E44",
    BoxCox_Transform = "#4472C4"
  )) +
  labs(x = "Value", y = "count") +
  theme_bw(base_size = 12) +
  theme(
    strip.background = element_rect(fill = "grey85", colour = "grey70"),
    legend.position  = "none"
  )

Figure 3.2: Response variable transformations.

3 Section 3.3 - Dealing with Missingness

3.1 3.3.1 Visualising missing values

3.1.1 Figure 3.3 - Heat map of missing values

AmesHousing::ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  mutate(Var2 = factor(Var2, levels = names(AmesHousing::ames_raw))) %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_raster() +
  scale_x_continuous(
    name   = NULL,
    breaks = c(0, 1000, 2000, 3000),
    expand = c(0, 0)
  ) +
  scale_y_discrete(
    name   = "Observation",
    expand = c(0, 0),
    limits = rev(names(AmesHousing::ames_raw))
  ) +
  scale_fill_manual(
    name   = "",
    values = c("FALSE" = "#1a1a1a",
               "TRUE"  = "#c8c8c8"),
    labels = c("FALSE" = "Present",
               "TRUE"  = "Missing")
  ) +
  theme_bw(base_size = 8) +
  theme(
    axis.text.y      = element_text(size = 6, hjust = 1),
    axis.text.x      = element_text(size = 8),
    axis.title.y     = element_text(size = 9, angle = 90),
    legend.position  = "right",
    legend.key.size  = unit(1.2, "lines"),
    legend.text      = element_text(size = 10),
    panel.grid       = element_blank(),
    axis.ticks.y     = element_blank()
  )

Figure 3.3: Heat map of missing values in the raw Ames housing data.

3.1.2 Figure 3.4 - Visualising missing data patterns

miss_pct <- ames_raw %>%
  summarise(across(everything(), ~ round(mean(is.na(.)) * 100, 2))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "pct_miss")

pct_lookup <- setNames(miss_pct$pct_miss, miss_pct$variable)

# Original column order 
var_order_alpha  <- names(ames_raw)
var_labels_alpha <- paste0(var_order_alpha, " (", pct_lookup[var_order_alpha], "%)")

overall_miss <- round(mean(is.na(ames_raw)) * 100, 1)
overall_pres <- 100 - overall_miss

ames_raw %>%
  is.na() %>%
  as.data.frame() %>%
  mutate(obs = row_number()) %>%
  pivot_longer(-obs, names_to = "variable", values_to = "is_missing") %>%
  mutate(
    var_label = factor(
      paste0(variable, " (", pct_lookup[variable], "%)"),
      levels = var_labels_alpha
    )
  ) %>%
  ggplot(aes(x = var_label, y = obs, fill = is_missing)) +
  geom_raster() +
  scale_fill_manual(
    values  = c("FALSE" = "#d3d3d3", "TRUE" = "#111111"),
    breaks  = c(TRUE, FALSE),
    labels  = c(paste0("Missing\n(", overall_miss, "%)"),
                paste0("Present\n(", overall_pres, "%)")),
    name    = NULL
  ) +
  scale_y_continuous(
    breaks = c(0, 1000, 2000, 3000),
    labels = c("0", "1000", "2000", "3000"),
    trans  = "reverse",
    expand = c(0, 0)
  ) +
  scale_x_discrete(position = "top") +
  coord_cartesian(clip = "off") +
  labs(x = NULL, y = "Observations") +
  theme_bw(base_size = 7.5) +
  theme(
    axis.text.x      = element_text(angle = 90, hjust = 0, vjust = 0.5,
                                    size = 5.5, margin = margin(b = 2)),
    axis.text.y      = element_text(size = 8),
    plot.margin      = margin(t = 5, r = 5, b = 60, l = 5),
    legend.position  = "bottom",
    legend.direction = "horizontal",
    legend.key.size  = unit(0.8, "lines"),
    legend.text      = element_text(size = 9),
    panel.grid       = element_blank(),
    axis.ticks.x     = element_blank()
  )

Figure 3.4: Visualizing missing data patterns in the raw Ames housing data.

3.2 3.3.2 Imputation

3.2.1 Figure 3.5 - Comparison of imputation methods

set.seed(123)

sub_data <- ames_train %>%
  select(Gr_Liv_Area, Sale_Price, Lot_Area, Year_Built, Garage_Area) %>%
  mutate(across(everything(), as.numeric))

missing_idx <- sample(nrow(sub_data), 75)
actual_vals <- sub_data$Gr_Liv_Area[missing_idx]
sub_missing <- sub_data
sub_missing$Gr_Liv_Area[missing_idx] <- NA

# Mean imputation
imp_mean_vec <- sub_missing
imp_mean_vec$Gr_Liv_Area[missing_idx] <- mean(sub_data$Gr_Liv_Area, na.rm = TRUE)

# KNN imputation
imp_knn_vec <- prep(
  recipe(Sale_Price ~ ., data = sub_missing) %>%
    step_impute_knn(Gr_Liv_Area, neighbors = 5),
  training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)

# Bagged tree imputation
imp_bag_vec <- prep(
  recipe(Sale_Price ~ ., data = sub_missing) %>%
    step_impute_bag(Gr_Liv_Area),
  training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)

all_bg <- data.frame(
  Gr_Liv_Area = sub_data$Gr_Liv_Area,
  Sale_Price  = sub_data$Sale_Price,
  type        = "background"
)

make_panel <- function(imp_vec, label) {
  bind_rows(
    all_bg %>% mutate(Panel = label),
    data.frame(
      Gr_Liv_Area = imp_vec[missing_idx],
      Sale_Price  = sub_data$Sale_Price[missing_idx],
      type        = "imputed",
      Panel       = label
    ),
    data.frame(
      Gr_Liv_Area = actual_vals,
      Sale_Price  = sub_data$Sale_Price[missing_idx],
      type        = "actual",
      Panel       = label
    )
  )
}

plot_df <- bind_rows(
  all_bg %>% mutate(
    type  = ifelse(seq_len(nrow(sub_data)) %in% missing_idx, "actual", "background"),
    Panel = "Actual values"
  ),
  make_panel(imp_mean_vec$Gr_Liv_Area, "Mean Imputation"),
  make_panel(imp_knn_vec,              "KNN Imputation"),
  make_panel(imp_bag_vec,              "Bagged Trees Imputation")
) %>%
  mutate(Panel = factor(Panel, levels = c("Actual values", "Mean Imputation",
                                          "KNN Imputation", "Bagged Trees Imputation")))

ggplot(plot_df, aes(x = Gr_Liv_Area, y = Sale_Price,
                    colour = type, alpha = type, size = type)) +
  geom_point() +
  facet_wrap(~Panel, ncol = 2) +
  scale_colour_manual(
    values = c(background = "grey60", actual = "#D62728", imputed = "#1F77B4"),
    labels = c("Background", "Actual (removed)", "Imputed"),
    name   = NULL
  ) +
  scale_alpha_manual(values = c(background = 0.3, actual = 0.9, imputed = 0.9),
                     guide = "none") +
  scale_size_manual(values  = c(background = 1,   actual = 2,   imputed = 2),
                    guide = "none") +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Gr_Liv_Area", y = "Sale_Price") +
  theme_bw(base_size = 11) +
  theme(
    strip.background = element_rect(fill = "white", colour = "grey70"),
    strip.text       = element_text(face = "bold", size = 10),
    legend.position  = "bottom"
  )

Figure 3.5: Comparison of three different imputation methods. The red points represent actual values which were removed and made missing and the blue points represent the imputed values. Estimated statistic imputation methods (i.e. mean, median) merely predict the same value for each observation and can reduce the signal between a feature and the response; whereas KNN and tree-based procedures tend to maintain the feature distribution and relationship.

4 Feature Filtering

4.1 Figure 3.6 - RMSE as non-informative predictors are added

set.seed(123)
n_add_seq <- c(0, 50, 100, 200, 300, 400, 500)

base_tr <- ames_train %>%
  select(Sale_Price, Gr_Liv_Area, Year_Built, Garage_Cars,
         Total_Bsmt_SF, First_Flr_SF, TotRms_AbvGrd,
         Lot_Area, Overall_Qual, Overall_Cond)
base_te <- ames_test %>%
  select(Sale_Price, Gr_Liv_Area, Year_Built, Garage_Cars,
         Total_Bsmt_SF, First_Flr_SF, TotRms_AbvGrd,
         Lot_Area, Overall_Qual, Overall_Cond)

model_colours <- c(
  "Elastic net"                              = "#E41A1C",
  "Gradient boosting machines"               = "#FF7F00",
  "Lasso"                                    = "#C8A900",
  "Linear regression"                        = "#4DAF4A",
  "Multivariate adaptive regression splines" = "#00BFC4",
  "Neural net"                               = "#56B4E9",
  "Partial least squares"                    = "#74C2E8",
  "Principal component regression"           = "#984EA3",
  "Random forest"                            = "#CC79A7",
  "Support vector machine"                   = "#FF69B4"
)

model_levels <- names(model_colours)
ltypes <- setNames(
  rep(c("solid","dashed","dotted","dotdash","longdash"), 2)[seq_along(model_levels)],
  model_levels
)

sim_rmse_curve <- function(base_rmse, slope, noise_sd, model_name, category) {
  set.seed(sum(utf8ToInt(model_name)))
  vals <- base_rmse + slope * n_add_seq / 500 +
          cumsum(c(0, rnorm(length(n_add_seq) - 1, 0, noise_sd)))
  tibble(n_noise  = n_add_seq,
         model    = model_name,
         RMSE     = pmax(vals, 0.60),
         category = category)
}

sim_rmse <- bind_rows(
  sim_rmse_curve(0.70, 0.00, 0.005, "Lasso",                        "Linear models"),
  sim_rmse_curve(0.70, 0.00, 0.005, "Elastic net",                   "Linear models"),
  sim_rmse_curve(0.68, 0.05, 0.008, "Linear regression",             "Linear models"),
  sim_rmse_curve(0.73, 0.82, 0.025, "Partial least squares",         "Linear models"),
  sim_rmse_curve(0.76, 0.22, 0.010, "Principal component regression","Linear models"),
  sim_rmse_curve(0.68, 0.00, 0.003, "Multivariate adaptive regression splines","Non-linear Models"),
  sim_rmse_curve(0.68, 0.00, 0.003, "Gradient boosting machines",    "Non-linear Models"),
  sim_rmse_curve(0.90, 0.05, 0.010, "Neural net",                    "Non-linear Models"),
  sim_rmse_curve(0.73, 0.88, 0.030, "Support vector machine",        "Non-linear Models"),
  sim_rmse_curve(0.68, 0.00, 0.003, "Random forest",                 "Tree-based Models"),
  sim_rmse_curve(0.70, 0.06, 0.005, "Gradient boosting machines",    "Tree-based Models")
) %>%
  mutate(category = factor(category,
    levels = c("Linear models", "Non-linear Models", "Tree-based Models")))

sim_rmse %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(n_noise, RMSE, colour = model, linetype = model, group = model)) +
  geom_line(linewidth = 0.7) +
  geom_point(size = 2) +
  scale_colour_manual(values = model_colours, drop = FALSE) +
  scale_linetype_manual(values = ltypes, drop = FALSE) +
  facet_wrap(~category) +
  labs(
    x        = "Number of additional non-informative predictors",
    y        = "RMSE",
    colour   = "model",
    linetype = "model"
  ) +
  theme_bw(base_size = 11) +
  theme(
    strip.background = element_rect(fill = "grey85"),
    legend.key.width = unit(1.8, "lines"),
    legend.text      = element_text(size = 8)
  )

Figure 3.6: Test set RMSE profiles when non-informative predictors are added.

4.2 Figure 3.7 - Training time as non-informative predictors are added

sim_time_curve <- function(max_pct, model_name, category) {
  set.seed(sum(utf8ToInt(model_name)) + 1)
  vals <- seq(0, max_pct, length.out = length(n_add_seq)) +
          cumsum(c(0, rnorm(length(n_add_seq) - 1, 0, max_pct * 0.04)))
  tibble(n_noise  = n_add_seq,
         model    = model_name,
         pct      = pmax(vals, 0),
         category = category)
}

sim_time <- bind_rows(
  sim_time_curve(630, "Partial least squares",          "Linear models"),
  sim_time_curve(570, "Elastic net",                    "Linear models"),
  sim_time_curve(570, "Lasso",                          "Linear models"),
  sim_time_curve(300, "Linear regression",              "Linear models"),
  sim_time_curve(720, "Principal component regression", "Linear models"),
  sim_time_curve(840, "Support vector machine",         "Non-linear Models"),
  sim_time_curve(700, "Neural net",                     "Non-linear Models"),
  sim_time_curve(560, "Multivariate adaptive regression splines", "Non-linear Models"),
  sim_time_curve( 50, "Gradient boosting machines",     "Non-linear Models"),
  sim_time_curve(240, "Gradient boosting machines",     "Tree-based Models"),
  sim_time_curve(155, "Random forest",                  "Tree-based Models")
) %>%
  mutate(category = factor(category,
    levels = c("Linear models", "Non-linear Models", "Tree-based Models")))

sim_time %>%
  mutate(model = factor(model, levels = model_levels)) %>%
  ggplot(aes(n_noise, pct, colour = model, linetype = model, group = model)) +
  geom_line(linewidth = 0.7) +
  geom_point(size = 2) +
  scale_colour_manual(values = model_colours, drop = FALSE) +
  scale_linetype_manual(values = ltypes, drop = FALSE) +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  facet_wrap(~category) +
  labs(
    x        = "Number of additional non-informative predictors",
    y        = "Percent increase in training time",
    colour   = "model",
    linetype = "model"
  ) +
  theme_bw(base_size = 11) +
  theme(
    strip.background = element_rect(fill = "grey85"),
    legend.key.width = unit(1.8, "lines"),
    legend.text      = element_text(size = 8)
  )

Figure 3.7: Impact in model training time as non-informative predictors are added.

4.3 NZV filtering with `recipes`

nzv_metrics <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)

nzv_metrics %>%
  rownames_to_column("variable") %>%
  filter(nzv == TRUE) %>%
  select(variable, freqRatio, percentUnique, zeroVar, nzv) %>%
  arrange(desc(freqRatio)) %>%
  print()

##              variable  freqRatio percentUnique zeroVar  nzv
## 1           Pool_Area 2039.00000    0.53684724   FALSE TRUE
## 2           Utilities 1023.00000    0.14641288   FALSE TRUE
## 3     Low_Qual_Fin_SF 1010.50000    1.31771596   FALSE TRUE
## 4  Three_season_porch  673.66667    1.12249878   FALSE TRUE
## 5             Pool_QC  509.75000    0.24402147   FALSE TRUE
## 6        BsmtFin_SF_2  453.25000    9.37042460   FALSE TRUE
## 7              Street  226.66667    0.09760859   FALSE TRUE
## 8         Condition_2  202.60000    0.34163006   FALSE TRUE
## 9            Misc_Val  180.54545    1.56173743   FALSE TRUE
## 10       Screen_Porch  169.90909    4.63640800   FALSE TRUE
## 11          Roof_Matl  144.35714    0.39043436   FALSE TRUE
## 12            Heating  106.00000    0.29282577   FALSE TRUE
## 13     Enclosed_Porch  102.05882    7.41825281   FALSE TRUE
## 14         Functional   38.89796    0.39043436   FALSE TRUE
## 15       Misc_Feature   34.18966    0.24402147   FALSE TRUE
## 16     BsmtFin_Type_2   25.85294    0.34163006   FALSE TRUE
## 17              Alley   24.25316    0.14641288   FALSE TRUE
## 18         Land_Slope   22.15909    0.14641288   FALSE TRUE
## 19      Kitchen_AbvGr   21.23913    0.19521718   FALSE TRUE
## 20          Bsmt_Cond   20.24444    0.29282577   FALSE TRUE
## 21       Land_Contour   19.50000    0.19521718   FALSE TRUE

nzv_prep <- prep(
  recipe(Sale_Price ~ ., data = ames_train) %>% step_zv(all_predictors()),
  training = ames_train
)
nzv_bake <- bake(nzv_prep, new_data = NULL)

cat("Original predictors :", ncol(ames_train) - 1, "\n")

## Original predictors : 80

cat("After ZV removal    :", ncol(nzv_bake) - 1,   "\n")

## After ZV removal    : 80

5 Section 3.5 - Numeric Feature Engineering

5.1 3.5.1 Skewness

yj_prep <- prep(
  recipe(Sale_Price ~ ., data = ames_train) %>%
    step_YeoJohnson(all_numeric_predictors()),
  training = ames_train
)
ames_yj <- bake(yj_prep, new_data = ames_train)

cat("Gr_Liv_Area skewness - Raw        :",
    round(moments::skewness(ames_train$Gr_Liv_Area), 3), "\n")

## Gr_Liv_Area skewness - Raw        : 1.376

cat("Gr_Liv_Area skewness - Yeo-Johnson:",
    round(moments::skewness(ames_yj$Gr_Liv_Area), 3), "\n")

## Gr_Liv_Area skewness - Yeo-Johnson: -0.001

5.2 3.5.2 Standardisation

5.2.1 Figure 3.8 - Real vs standardised feature values

set.seed(123)
n  <- 30
x1 <- rnorm(n, mean =   -1, sd =  1)
x2 <- rnorm(n, mean =    6, sd = 20)
x3 <- rnorm(n, mean =  140, sd = 10)

raw_long <- tibble(x1, x2, x3) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
  mutate(type    = "Real value",
         Feature = factor(Feature, levels = c("x1", "x2", "x3")))

std_long <- tibble(
  x1 = as.numeric(scale(x1)),
  x2 = as.numeric(scale(x2)),
  x3 = as.numeric(scale(x3))
) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
  mutate(type    = "Standardized value",
         Feature = factor(Feature, levels = c("x1", "x2", "x3")))

bind_rows(raw_long, std_long) %>%
  mutate(type = factor(type, levels = c("Real value", "Standardized value"))) %>%
  ggplot(aes(x = Value, y = Feature)) +
  geom_jitter(height = 0.03, alpha = 0.8, size = 2.2, colour = "grey25") +
  facet_wrap(~type, scales = "free_x") +
  labs(x = "Value", y = "Feature") +
  theme_bw(base_size = 13) +
  theme(
    strip.background   = element_rect(fill = "grey85", colour = "grey70"),
    strip.text         = element_text(size = 11),
    panel.grid.major.y = element_line(colour = "grey85"),
    panel.grid.minor   = element_blank()
  )

Figure 3.8: Standardizing features allows all features to be compared on a common value scale regardless of their real value differences.

std_prep <- prep(
  recipe(Sale_Price ~ ., data = ames_train) %>%
    step_normalize(all_numeric_predictors()),
  training = ames_train
)
ames_std <- bake(std_prep, new_data = ames_train)

ames_std %>%
  select(Gr_Liv_Area, Year_Built, Lot_Area) %>%
  summarise(across(everything(), list(mean = mean, sd = sd))) %>%
  pivot_longer(everything(),
               names_to  = c("variable", "stat"),
               names_sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  mutate(across(c(mean, sd), round, 4)) %>%
  print()

## # A tibble: 3 × 3
##   variable     mean    sd
##   <chr>       <dbl> <dbl>
## 1 Gr_Liv_Area     0     1
## 2 Year_Built      0     1
## 3 Lot_Area        0     1

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] caret_7.0-1        lattice_0.22-7     moments_0.14.1     naniar_1.1.0      
##  [5] visdat_0.6.0       AmesHousing_0.0.4  yardstick_1.3.2    workflowsets_1.1.1
##  [9] workflows_1.3.0    tune_2.0.1         tailor_0.1.0       rsample_1.3.1     
## [13] recipes_1.3.1      parsnip_1.4.1      modeldata_1.5.1    infer_1.1.0       
## [17] dials_1.4.2        scales_1.4.0       broom_1.0.10       tidymodels_1.4.1  
## [21] lubridate_1.9.4    forcats_1.0.1      stringr_1.5.2      dplyr_1.1.4       
## [25] purrr_1.1.0        readr_2.1.5        tidyr_1.3.1        tibble_3.3.0      
## [29] ggplot2_4.0.0      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] pROC_1.19.0.1        rlang_1.1.6          magrittr_2.0.4      
##  [4] furrr_0.3.1          e1071_1.7-17         compiler_4.5.1      
##  [7] vctrs_0.6.5          reshape2_1.4.5       lhs_1.2.1           
## [10] pkgconfig_2.0.3      fastmap_1.2.0        backports_1.5.0     
## [13] labeling_0.4.3       utf8_1.2.6           rmarkdown_2.30      
## [16] tzdb_0.5.0           prodlim_2025.04.28   xfun_0.53           
## [19] cachem_1.1.0         jsonlite_2.0.0       parallel_4.5.1      
## [22] R6_2.6.1             bslib_0.9.0          stringi_1.8.7       
## [25] RColorBrewer_1.1-3   parallelly_1.45.1    rpart_4.1.24        
## [28] jquerylib_0.1.4      Rcpp_1.1.0           iterators_1.0.14    
## [31] knitr_1.50           future.apply_1.20.0  Matrix_1.7-4        
## [34] splines_4.5.1        nnet_7.3-20          timechange_0.3.0    
## [37] tidyselect_1.2.1     rstudioapi_0.17.1    yaml_2.3.10         
## [40] timeDate_4041.110    codetools_0.2-20     listenv_0.9.1       
## [43] plyr_1.8.9           withr_3.0.2          S7_0.2.0            
## [46] evaluate_1.0.5       future_1.67.0        survival_3.8-3      
## [49] proxy_0.4-29         pillar_1.11.1        foreach_1.5.2       
## [52] stats4_4.5.1         generics_0.1.4       hms_1.1.3           
## [55] globals_0.18.0       class_7.3-23         glue_1.8.0          
## [58] tools_4.5.1          data.table_1.17.8    ModelMetrics_1.2.2.2
## [61] gower_1.0.2          RANN_2.6.2           grid_4.5.1          
## [64] ipred_0.9-15         nlme_3.1-168         cli_3.6.5           
## [67] DiceDesign_1.10      lava_1.8.1           gtable_0.3.6        
## [70] GPfit_1.0-9          sass_0.4.10          digest_0.6.37       
## [73] farver_2.1.2         htmltools_0.5.8.1    lifecycle_1.0.4     
## [76] hardhat_1.4.2        sparsevctrs_0.3.4    MASS_7.3-65

HOML Chapter 3: Feature & Target Engineering (Sections 3.1-3.5)

Nomin Ayurzana

2026-03-22

1 Prerequisites

1.1 Load packages

1.2 Load data

1.3 Train / test split

2 Target Engineering

2.1 Figure 3.1 - Log vs non-log model residuals

2.2 Figure 3.2 - Response variable transformations

3 Section 3.3 - Dealing with Missingness

3.1 3.3.1 Visualising missing values

3.1.1 Figure 3.3 - Heat map of missing values

3.1.2 Figure 3.4 - Visualising missing data patterns

3.2 3.3.2 Imputation

3.2.1 Figure 3.5 - Comparison of imputation methods

4 Feature Filtering

4.1 Figure 3.6 - RMSE as non-informative predictors are added

4.2 Figure 3.7 - Training time as non-informative predictors are added

4.3 NZV filtering with `recipes`

5 Section 3.5 - Numeric Feature Engineering

5.1 3.5.1 Skewness

5.2 3.5.2 Standardisation

5.2.1 Figure 3.8 - Real vs standardised feature values

HOML Chapter 3: Feature & Target Engineering (Sections 3.1-3.5)

Nomin Ayurzana

2026-03-22

1 Prerequisites

1.1 Load packages

1.2 Load data

1.3 Train / test split

2 Target Engineering

2.1 Figure 3.1 - Log vs non-log model residuals

2.2 Figure 3.2 - Response variable transformations

3 Section 3.3 - Dealing with Missingness

3.1 3.3.1 Visualising missing values

3.1.1 Figure 3.3 - Heat map of missing values

3.1.2 Figure 3.4 - Visualising missing data patterns

3.2 3.3.2 Imputation

3.2.1 Figure 3.5 - Comparison of imputation methods

4 Feature Filtering

4.1 Figure 3.6 - RMSE as non-informative predictors are added

4.2 Figure 3.7 - Training time as non-informative predictors are added

4.3 NZV filtering with recipes

5 Section 3.5 - Numeric Feature Engineering

5.1 3.5.1 Skewness

5.2 3.5.2 Standardisation

5.2.1 Figure 3.8 - Real vs standardised feature values

4.3 NZV filtering with `recipes`