Reference: Boehmke, B. & Greenwell, B. (2020). Hands-On Machine Learning with R.
Chapter 3: Feature & Target Engineering.
Online: https://bradleyboehmke.github.io/HOML/engineering.html


3.1 Prerequisites

pkgs <- c("dplyr", "ggplot2", "recipes", "rsample",
          "AmesHousing", "forecast", "broom",
          "purrr", "tidyr", "scales", "tibble")
install.packages(setdiff(pkgs, rownames(installed.packages())))
# Core helpers
library(dplyr)
library(ggplot2)
library(tibble)
library(tidyr)
library(purrr)
library(scales)

# ML / feature engineering
library(AmesHousing)  # required for make_ames() and ames_raw
library(rsample)
library(recipes)
library(broom)
library(forecast)

ggplot2::theme_set(ggplot2::theme_light())
ames <- AmesHousing::make_ames()

set.seed(123)
split      <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test  <- testing(split)

cat("Training rows:", nrow(ames_train), "\n")
## Training rows: 2049
cat("Test rows:    ", nrow(ames_test),  "\n")
## Test rows:     881

3.2 Target Engineering

Figure 3.1 – Effect of log-transforming the target

models <- c(
  "Non-log transformed model residuals",
  "Log transformed model residuals"
)

list(
  m1 = lm(Sale_Price       ~ Year_Built, data = ames_train),
  m2 = lm(log(Sale_Price)  ~ Year_Built, data = ames_train)
) %>%
  map2_dfr(models, ~ augment(.x) %>% mutate(model = .y)) %>%
  ggplot(aes(.resid)) +
  geom_histogram(bins = 75) +
  facet_wrap(~ model, scales = "free_x") +
  ylab(NULL) +
  xlab("Residuals") +
  ggtitle("Figure 3.1: Residuals before and after log-transforming Sale_Price")

Applying the log transformation

# Manual log transformation
transformed_response <- log(ames_train$Sale_Price)
head(transformed_response)
## [1] 11.56647 11.38509 11.69525 11.73607 11.11988 11.62625
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(all_outcomes())

ames_recipe

Why log1p? When values can equal zero, log(0) = -Inf. log1p() computes log(1 + x) and is safe for x > -1:

log(-0.5)    # NaN
## [1] NaN
log1p(-0.5)  # -0.693
## [1] -0.6931472

Figure 3.2 – Comparing Normal, Log, and Box-Cox distributions

The Box-Cox family of transformations generalises the log:

\[y(\lambda) = \begin{cases} \frac{Y^\lambda - 1}{\lambda}, & \lambda \neq 0 \\ \log(Y), & \lambda = 0 \end{cases}\]

# Log transformation of training target
train_log_y <- log(ames_train$Sale_Price)

# Box-Cox: estimate lambda from training data, apply to both sets
lambda     <- BoxCox.lambda(ames_train$Sale_Price)
cat("Estimated lambda:", round(lambda, 4), "\n")
## Estimated lambda: 0.1387
train_bc_y <- BoxCox(ames_train$Sale_Price, lambda)

# Compare distributions
levs <- c("Normal", "Log_Transform", "BoxCox_Transform")

data.frame(
  Normal           = ames_train$Sale_Price,
  Log_Transform    = train_log_y,
  BoxCox_Transform = train_bc_y
) %>%
  gather(Transform, Value) %>%
  mutate(Transform = factor(Transform, levels = levs)) %>%
  ggplot(aes(Value, fill = Transform)) +
  geom_histogram(show.legend = FALSE, bins = 40) +
  facet_wrap(~ Transform, scales = "free_x") +
  ggtitle("Figure 3.2: Normal, Log, and Box-Cox transformed Sale_Price")


3.3 Dealing with Missingness

3.3.1 Visualising missing values

ames_raw <- AmesHousing::ames_raw
cat("Total missing values in raw Ames data:", sum(is.na(ames_raw)), "\n")
## Total missing values in raw Ames data: 13997

Figure 3.3 – Missing-value heatmap (reproduced without visdat):

# Build a tidy missingness matrix
miss_df <- ames_raw %>%
  summarise(across(everything(), ~ mean(is.na(.)))) %>%
  pivot_longer(everything(),
               names_to  = "Variable",
               values_to = "PctMissing") %>%
  filter(PctMissing > 0) %>%
  arrange(desc(PctMissing))

miss_df
## # A tibble: 27 × 2
##    Variable      PctMissing
##    <chr>              <dbl>
##  1 Pool QC           0.996 
##  2 Misc Feature      0.964 
##  3 Alley             0.932 
##  4 Fence             0.805 
##  5 Fireplace Qu      0.485 
##  6 Lot Frontage      0.167 
##  7 Garage Yr Blt     0.0543
##  8 Garage Finish     0.0543
##  9 Garage Qual       0.0543
## 10 Garage Cond       0.0543
## # ℹ 17 more rows
miss_df %>%
  ggplot(aes(x = reorder(Variable, PctMissing), y = PctMissing)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  scale_y_continuous(labels = percent_format()) +
  xlab(NULL) +
  ylab("Proportion Missing") +
  ggtitle("Figure 3.3: Variables with missing values (Ames raw data)")

3.3.2 Imputation

3.3.2.1 Estimated statistic (mean / median)

ames_recipe_impute <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_mean(all_numeric_predictors())

ames_recipe_impute

3.3.2.2 K-Nearest Neighbor imputation

ames_recipe_knn <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_knn(all_predictors(), neighbors = 6)

ames_recipe_knn

3.3.2.3 Tree-based (bagged trees) imputation

ames_recipe_bag <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_bag(all_predictors())

ames_recipe_bag

3.4 Feature Filtering

# Base-R near-zero variance detection (no caret needed)
# A feature is NZV if: freqRatio > 19  OR  percentUnique < 10
nzv_check <- function(x) {
  if (!is.numeric(x)) return(FALSE)
  x <- x[!is.na(x)]
  if (length(x) == 0) return(TRUE)
  tbl           <- sort(table(x), decreasing = TRUE)
  freq_ratio    <- if (length(tbl) > 1) tbl[[1]] / tbl[[2]] else Inf
  pct_unique    <- length(tbl) / length(x) * 100
  freq_ratio > 19 | pct_unique < 10
}

nzv_features <- ames_train %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(), nzv_check)) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "nzv") %>%
  filter(nzv == TRUE)

cat("Near-zero variance features found:", nrow(nzv_features), "\n")
## Near-zero variance features found: 26
nzv_features
## # A tibble: 26 × 2
##    Feature         nzv  
##    <chr>           <lgl>
##  1 Lot_Frontage    TRUE 
##  2 Year_Built      TRUE 
##  3 Year_Remod_Add  TRUE 
##  4 Mas_Vnr_Area    TRUE 
##  5 BsmtFin_SF_1    TRUE 
##  6 BsmtFin_SF_2    TRUE 
##  7 Second_Flr_SF   TRUE 
##  8 Low_Qual_Fin_SF TRUE 
##  9 Bsmt_Full_Bath  TRUE 
## 10 Bsmt_Half_Bath  TRUE 
## # ℹ 16 more rows
ames_recipe_nzv <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors())

ames_recipe_nzv

3.5 Numeric Feature Engineering

3.5.1 Skewness

Numeric predictors can be skewed just like the target. We measure skewness and flag features that exceed a common threshold of |skewness| > 1.

# Base-R skewness: third standardised moment
skewness_br <- function(x) {
  x   <- x[!is.na(x)]
  n   <- length(x)
  m   <- mean(x)
  s   <- sd(x)
  if (s == 0 || n < 3) return(NA_real_)
  (sum((x - m)^3) / n) / s^3
}

skew_vals <- ames_train %>%
  select(where(is.numeric), -Sale_Price) %>%
  summarise(across(everything(), skewness_br)) %>%
  pivot_longer(everything(),
               names_to  = "Feature",
               values_to = "Skewness") %>%
  arrange(desc(abs(Skewness)))

cat("Features with |skewness| > 1:", sum(abs(skew_vals$Skewness) > 1, na.rm = TRUE), "\n")
## Features with |skewness| > 1: 16
head(skew_vals, 10)
## # A tibble: 10 × 2
##    Feature            Skewness
##    <chr>                 <dbl>
##  1 Misc_Val              20.7 
##  2 Pool_Area             16.7 
##  3 Lot_Area              13.0 
##  4 Three_season_porch    11.3 
##  5 Low_Qual_Fin_SF       10.6 
##  6 Kitchen_AbvGr          4.45
##  7 Enclosed_Porch         4.40
##  8 BsmtFin_SF_2           4.16
##  9 Bsmt_Half_Bath         4.00
## 10 Screen_Porch           3.75
skew_vals %>%
  ggplot(aes(Skewness)) +
  geom_histogram(bins = 30, fill = "steelblue", colour = "white") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", colour = "red") +
  xlab("Skewness") + ylab("Count") +
  ggtitle("Distribution of skewness across numeric predictors",
          subtitle = "Red dashed lines at ±1 — common threshold for transformation")

ames_recipe_yj <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric_predictors())

ames_recipe_yj

3.5.2 Standardisation

Algorithms that rely on distance or gradients (KNN, SVM, neural networks, regularised regression) require features to be on a common scale.

ames_recipe_norm <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_normalize(all_numeric_predictors())   # centers then scales

ames_recipe_norm

Before vs. after standardisation — Gr_Liv_Area:

ames_prepped <- prep(ames_recipe_norm, training = ames_train)
ames_baked   <- bake(ames_prepped, new_data = ames_train)

ggplot(ames_train, aes(Gr_Liv_Area)) +
  geom_histogram(bins = 50, fill = "coral", colour = "white") +
  ggtitle("Before normalisation: Gr_Liv_Area")

ggplot(ames_baked, aes(Gr_Liv_Area)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  ggtitle("After normalisation: Gr_Liv_Area (mean=0, sd=1)")

Summary statistics confirm the transformation:

cat("--- Before ---\n")
## --- Before ---
cat("Mean:", round(mean(ames_train$Gr_Liv_Area), 2),
    " SD:", round(sd(ames_train$Gr_Liv_Area), 2), "\n\n")
## Mean: 1504.41  SD: 518.37
cat("--- After ---\n")
## --- After ---
cat("Mean:", round(mean(ames_baked$Gr_Liv_Area), 6),
    " SD:", round(sd(ames_baked$Gr_Liv_Area), 4), "\n")
## Mean: 0  SD: 1

Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forecast_9.0.1    broom_1.0.12      recipes_1.3.1     rsample_1.3.2    
##  [5] AmesHousing_0.0.4 scales_1.4.0      purrr_1.2.1       tidyr_1.3.2      
##  [9] tibble_3.3.1      ggplot2_4.0.2     dplyr_1.2.0      
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6        xfun_0.56           bslib_0.10.0       
##  [4] lattice_0.22-7      vctrs_0.7.1         tools_4.5.2        
##  [7] generics_0.1.4      parallel_4.5.2      pkgconfig_2.0.3    
## [10] Matrix_1.7-4        data.table_1.18.2.1 RColorBrewer_1.1-3 
## [13] S7_0.2.1            lifecycle_1.0.5     compiler_4.5.2     
## [16] farver_2.1.2        codetools_0.2-20    htmltools_0.5.9    
## [19] class_7.3-23        sass_0.4.10         yaml_2.3.12        
## [22] prodlim_2025.04.28  pillar_1.11.1       furrr_0.3.1        
## [25] jquerylib_0.1.4     MASS_7.3-65         cachem_1.1.0       
## [28] gower_1.0.2         rpart_4.1.24        nlme_3.1-168       
## [31] parallelly_1.46.1   lava_1.8.2          fracdiff_1.5-3     
## [34] tidyselect_1.2.1    digest_0.6.39       future_1.69.0      
## [37] listenv_0.10.0      labeling_0.4.3      splines_4.5.2      
## [40] fastmap_1.2.0       grid_4.5.2          colorspace_2.1-2   
## [43] cli_3.6.5           magrittr_2.0.4      utf8_1.2.6         
## [46] survival_3.8-3      future.apply_1.20.2 withr_3.0.2        
## [49] backports_1.5.0     lubridate_1.9.5     timechange_0.4.0   
## [52] rmarkdown_2.30      globals_0.19.0      nnet_7.3-20        
## [55] timeDate_4052.112   zoo_1.8-15          urca_1.3-4         
## [58] evaluate_1.0.5      knitr_1.51          hardhat_1.4.2      
## [61] rlang_1.1.7         Rcpp_1.1.1          glue_1.8.0         
## [64] ipred_0.9-15        jsonlite_2.0.0      R6_2.6.1