Overview

This document reproduces all code outputs from Chapter 3: Feature & Target Engineering (Sections 3.1 – 3.5) of the online book Hands-On Machine Learning with R by Bradley Boehmke & Brandon Greenwell.

Note on the dataset: AmesHousing::make_ames() returns a pre-cleaned version of the data with no missing values. To faithfully demonstrate Section 3.3, we instead use the raw Ames data (AmesHousing::ames_raw), which retains the real NAs, and split it into training and test sets ourselves.

Section	Topic
3.1	Prerequisites
3.2	Target engineering
3.3	Dealing with missingness
3.4	Feature filtering
3.5	Numeric feature engineering

3.1 Prerequisites

# Helper packages
library(dplyr)       # data manipulation
library(ggplot2)     # graphics
library(visdat)      # missing value visualisation

# Feature engineering packages
library(caret)       # ML tasks
library(recipes)     # feature engineering blueprints

# Supporting packages
library(AmesHousing) # Ames housing dataset (raw + cleaned)
library(rsample)     # train/test splitting
library(patchwork)   # combining ggplots
library(naniar)      # missing data tools
library(e1071)       # skewness()
library(corrplot)    # correlation heatmap
library(forecast)    # BoxCox.lambda()
library(scales)      # dollar labels
library(tidyr)       # pivot_longer()

# Fix the RNG state first so the stratified split below is reproducible
set.seed(123)

# Start from the RAW Ames table (NAs intact, needed for Section 3.3), make the
# column names syntactic, and rename the response to Sale_Price.
# NOTE(review): the raw table appears to include the Order and PID identifier
# columns, which every `Sale_Price ~ .` recipe below will treat as predictors -
# confirm that is intended.
ames_raw <- AmesHousing::ames_raw %>%
  rename_with(make.names) %>%
  rename(Sale_Price = SalePrice) %>%
  # Helper column: sale-price quartile, used only as the stratification key
  mutate(price_q = dplyr::ntile(Sale_Price, 4))

# 75/25 split stratified on the quartile helper; the helper column is removed
# again from both partitions
split      <- rsample::initial_split(ames_raw, prop = 0.75, strata = "price_q")
ames_train <- select(rsample::training(split), -price_q)
ames_test  <- select(rsample::testing(split), -price_q)

# Sanity-check the split: partition sizes, predictor count, and the number of
# NA cells carried by the training partition
cat("Training rows :", nrow(ames_train), "\n")

## Training rows : 2196

cat("Test rows     :", nrow(ames_test),  "\n")

## Test rows     : 734

# Every column except the response is counted as a feature
cat("Features      :", ncol(ames_train) - 1, "\n")

## Features      : 81

cat("Missing cells (train):", sum(is.na(ames_train)), "\n")

## Missing cells (train): 10548

3.2 Target Engineering

Although not always required, transforming the response variable can lead to predictive improvement, especially with parametric models that assume normally distributed errors.

3.2.1 Visualising the skewed response

# Left panel: sale price on its original dollar scale
p1 <- ames_train %>%
  ggplot(aes(Sale_Price)) +
  geom_histogram(bins = 70, fill = "#4472C4", colour = "white", linewidth = 0.2) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Original Sale Price", x = "Sale Price ($)", y = "Count") +
  theme_bw(base_size = 12)

# Right panel: the same prices after a natural-log transform
p2 <- ames_train %>%
  ggplot(aes(log(Sale_Price))) +
  geom_histogram(bins = 70, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  labs(title = "log(Sale Price)", x = "log(Sale Price)", y = "Count") +
  theme_bw(base_size = 12)

# patchwork places the two panels side by side under a shared title/subtitle
p1 + p2 +
  plot_annotation(
    subtitle = "Right-skewed original vs. approximately normal log-transform",
    title    = "Figure 3.1 - Transforming the response variable"
  )

Figure 3.1 - Right-skewed Sale_Price distribution

3.2.2 Option 1 - Log transformation

# Option 1: natural log of the training response
transformed_response <- log(ames_train$Sale_Price)

# Compare sample skewness before and after the transform (rounded to 3 dp)
cat("Skewness (original)       :", round(e1071::skewness(ames_train$Sale_Price), 3), "\n")

## Skewness (original)       : 1.602

cat("Skewness (log-transformed):", round(e1071::skewness(transformed_response),  3), "\n")

## Skewness (log-transformed): -0.126

# Blueprint form of the log transform: the step is declared once on the
# outcome, so the same transformation is applied wherever the recipe is used
ames_recipe_log <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

# Print the (unprepped) recipe
ames_recipe_log

Handling zeros and small negatives (x > -1) with log1p

log(-0.5)       # NaN - cannot log a negative

## [1] NaN

log1p(-0.5)     # log(1 + x): finite only for x > -1 (log1p(-1) is -Inf)

## [1] -0.6931472

3.2.3 Option 2 - Box-Cox transformation

# Estimate lambda by log-likelihood from the TRAINING prices only, so nothing
# from the test partition leaks into the transformation
lambda_bc <- forecast::BoxCox.lambda(
  x      = ames_train[["Sale_Price"]],
  method = "loglik"
)
cat("Optimal Box-Cox lambda:", round(lambda_bc, 4), "\n")

## Optimal Box-Cox lambda: 0.05

# Apply the fitted lambda to the training response
bc_train <- forecast::BoxCox(x = ames_train[["Sale_Price"]], lambda = lambda_bc)

# Three histograms of the training response: original dollars, natural log,
# and Box-Cox with the lambda estimated above.
p_orig <- ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 70, fill = "#7F8C8D", colour = "white", linewidth = 0.2) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Original", x = "Sale Price ($)", y = "Count") +
  theme_bw(base_size = 11)

# Map the column through `data` instead of extracting it with `$` inside
# aes(); this matches how p_orig (and Figure 3.1) are built
p_log <- ggplot(ames_train, aes(x = log(Sale_Price))) +
  geom_histogram(bins = 70, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  labs(title = "log(Sale Price)", x = "log(Sale Price)", y = "Count") +
  theme_bw(base_size = 11)

# bc_train is a free-standing vector, so wrap it in a one-column data frame
# rather than letting aes() capture it from the calling environment
p_bc <- ggplot(data.frame(bc_sale_price = as.numeric(bc_train)),
               aes(x = bc_sale_price)) +
  geom_histogram(bins = 70, fill = "#C0392B", colour = "white", linewidth = 0.2) +
  labs(title = paste0("Box-Cox (lambda = ", round(lambda_bc, 2), ")"),
       x = "Box-Cox(Sale Price)", y = "Count") +
  theme_bw(base_size = 11)

# patchwork: three panels in a row with a shared title/subtitle
p_orig + p_log + p_bc +
  plot_annotation(
    title    = "Figure 3.2 - Comparing transformations of Sale_Price",
    subtitle = "Box-Cox finds the optimal power transformation automatically"
  )

Figure 3.2 - Box-Cox transformation of Sale Price

# Normal Q-Q plots for the original, log, and Box-Cox responses in one row.
# par() returns the previous values of the settings it changes; keep them so
# BOTH mfrow and mar can be restored afterwards (resetting only mfrow would
# leave the modified margins in effect).
old_par <- par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))
qqnorm(ames_train$Sale_Price,
       main = "QQ - Original", col = "#7F8C8D", pch = 16, cex = 0.4)
qqline(ames_train$Sale_Price, col = "red", lwd = 2)

qqnorm(log(ames_train$Sale_Price),
       main = "QQ - log", col = "#27AE60", pch = 16, cex = 0.4)
qqline(log(ames_train$Sale_Price), col = "red", lwd = 2)

qqnorm(bc_train,
       main = paste0("QQ - Box-Cox (lam=", round(lambda_bc, 2), ")"),
       col = "#C0392B", pch = 16, cex = 0.4)
qqline(bc_train, col = "red", lwd = 2)

Figure 3.2b - Normal Q-Q plots

# Restore the graphics settings that were in effect before this chunk
par(old_par)

# Blueprint form of the Box-Cox transform on the outcome; the recipe step
# estimates lambda itself from the data the recipe is trained on
ames_recipe_bc <- step_BoxCox(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

# Print the (unprepped) recipe
ames_recipe_bc

Skewness summary table

# One row per version of the response (original, log, Box-Cox) with its
# sample skewness rounded to 4 dp, rendered as Table 3.1
knitr::kable(
  data.frame(
    Transformation = c(
      "Original",
      "log(Sale Price)",
      paste0("Box-Cox (lambda = ", round(lambda_bc, 3), ")")
    ),
    Skewness = round(
      vapply(
        list(ames_train$Sale_Price, log(ames_train$Sale_Price), bc_train),
        e1071::skewness,
        numeric(1)
      ),
      4
    )
  ),
  caption = "Table 3.1 - Skewness after each transformation"
)

Table 3.1 - Skewness after each transformation
Transformation	Skewness
Original	1.6016
log(Sale Price)	-0.1256
Box-Cox (lambda = 0.05)	-0.0165

3.3 Dealing with Missingness

3.3.1 Visualising missing values

# Percentage of NA cells per training column, reshaped to one row per feature,
# keeping only features with at least one NA, worst first
miss_summary <- ames_train |>
  summarise(across(everything(), \(col) sum(is.na(col)) / n() * 100)) |>
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "Feature",
    values_to = "Pct_Missing"
  ) |>
  filter(Pct_Missing > 0) |>
  arrange(desc(Pct_Missing))

# miss_summary holds one row per feature with any NA, so its row count is the
# number of affected features
cat("Features with missing values:", nrow(miss_summary), "\n")

## Features with missing values: 26

# Total NA cells across the whole training partition
cat("Total missing cells         :", sum(is.na(ames_train)), "\n")

## Total missing cells         : 10548

# Table 3.2: the 15 features with the highest share of missing values
miss_summary %>%
  head(15) %>%
  knitr::kable(
    caption = "Table 3.2 - Top 15 features by % missing (training set)",
    digits  = 2
  )

Table 3.2 - Top 15 features by % missing (training set)
Feature	Pct_Missing
Pool.QC	99.59
Misc.Feature	96.72
Alley	93.17
Fence	80.92
Fireplace.Qu	48.82
Lot.Frontage	16.99
Garage.Yr.Blt	5.56
Garage.Finish	5.56
Garage.Qual	5.56
Garage.Cond	5.56
Garage.Type	5.46
Bsmt.Exposure	2.96
Bsmt.Qual	2.82
Bsmt.Cond	2.82
BsmtFin.Type.1	2.82

# Column-type / missingness overview; columns are kept in their original
# order rather than sorted by type
ames_train %>%
  visdat::vis_dat(sort_type = FALSE) +
  labs(title = "Figure 3.3 - Data types and missingness in ames_train")

Figure 3.3 - vis_dat overview of training data

# Missingness map with row clustering switched on
ames_train %>%
  visdat::vis_miss(cluster = TRUE) +
  labs(title = "Figure 3.3b - Clustered missing-value pattern")

Figure 3.3b - vis_miss clustered plot

# Names of the columns that actually contain NAs; miss_summary was already
# restricted to Pct_Missing > 0 when it was built
miss_vars <- miss_summary$Feature

# The UpSet plot is only drawn when at least two variables have NAs; it is
# capped at the 10 largest sets
if (length(miss_vars) < 2) {
  cat("Not enough variables with missing data for an UpSet plot.\n")
} else {
  ames_train %>%
    dplyr::select(dplyr::all_of(miss_vars)) %>%
    naniar::gg_miss_upset(nsets = min(10, length(miss_vars)))
}

Figure 3.3c - UpSet plot of co-occurring missing variables

# Horizontal bar chart of % missing per feature, ordered by % missing, with
# dashed reference lines at 5% and 20%
ggplot(miss_summary, aes(x = reorder(Feature, Pct_Missing), y = Pct_Missing)) +
  geom_col(fill = "#4472C4") +
  geom_hline(yintercept = 5, colour = "orange", linetype = "dashed", linewidth = 0.7) +
  geom_hline(yintercept = 20, colour = "red", linetype = "dashed", linewidth = 0.7) +
  coord_flip() +
  labs(
    title    = "Figure 3.3d - % Missing per feature (training set)",
    subtitle = "Dashed lines at 5% (orange) and 20% (red)",
    x        = NULL,
    y        = "Missing (%)"
  ) +
  theme_bw(base_size = 11)

Figure 3.3d - % missing per feature bar chart

3.3.2 Imputation

3.3.2.1 Estimated statistic (median / mode)

# Blueprint: fill numeric predictors with the median and nominal predictors
# with the mode
ames_recipe_imp_stat <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors())

# Print the (unprepped) recipe
ames_recipe_imp_stat

# Train the recipe on the training partition, then apply it to that same
# partition to obtain the imputed data
prep_stat  <- ames_recipe_imp_stat |> prep(training = ames_train)
baked_stat <- prep_stat |> bake(new_data = ames_train)

# Observed Lot.Frontage values only (rows with NA are dropped before plotting)
p_orig_lf <- ggplot(filter(ames_train, !is.na(Lot.Frontage)), aes(Lot.Frontage)) +
  geom_histogram(bins = 40, fill = "#7F8C8D", colour = "white", linewidth = 0.2) +
  labs(title = "Original (non-missing only)", x = "Lot Frontage (ft)", y = "Count") +
  theme_bw(base_size = 11)

# Same variable taken from the median/mode-imputed training data
p_med_lf <- ggplot(baked_stat, aes(Lot.Frontage)) +
  geom_histogram(bins = 40, fill = "#4472C4", colour = "white", linewidth = 0.2) +
  labs(title = "After median imputation", x = "Lot Frontage (ft)", y = "Count") +
  theme_bw(base_size = 11)

# Side-by-side comparison
(p_orig_lf + p_med_lf) +
  plot_annotation(title = "Figure 3.4a - Lot.Frontage: original vs. median-imputed")

Figure 3.4a - Median imputation for Lot.Frontage

3.3.2.2 K-Nearest Neighbour imputation

# Blueprint: impute every predictor from its 6 nearest neighbours
ames_recipe_knn <- step_impute_knn(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors(),
  neighbors = 6
)

# Print the (unprepped) recipe
ames_recipe_knn

# Train on the training partition and apply to the same partition
prep_knn  <- ames_recipe_knn |> prep(training = ames_train)
baked_knn <- prep_knn |> bake(new_data = ames_train)

# Lot.Frontage taken from the KNN-imputed training data
p_knn_lf <- ggplot(baked_knn, aes(Lot.Frontage)) +
  geom_histogram(bins = 40, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  labs(title = "After KNN imputation (k = 6)", x = "Lot Frontage (ft)", y = "Count") +
  theme_bw(base_size = 11)

# Observed values vs. median-imputed vs. KNN-imputed, in one row
(p_orig_lf + p_med_lf + p_knn_lf) +
  plot_annotation(title = "Figure 3.4b - Imputation methods compared for Lot.Frontage")

Figure 3.4b - KNN vs median imputation for Lot.Frontage

3.3.2.3 Tree-based imputation (bagged trees)

# Blueprint: impute every predictor with bagged trees (25 trees); the recipe
# is only defined and printed here, never prepped
ames_recipe_bag <- step_impute_bag(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors(),
  trees = 25
)

ames_recipe_bag

3.4 Feature Filtering

Removing features with near-zero variance (NZV) reduces noise and computational cost.

Near-zero variance detection

# Per-column frequency ratio, % unique values, and zero/near-zero-variance
# flags, computed on the raw (un-imputed) training partition
nzv_metrics <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)

# Table 3.3: the flagged columns (at most 20), ordered by % unique descending
nzv_metrics %>%
  tibble::rownames_to_column("Feature") %>%
  filter(nzv) %>%
  arrange(desc(percentUnique)) %>%
  head(20) %>%
  knitr::kable(
    caption = "Table 3.3 - Near-zero variance features",
    digits  = 3
  )

Table 3.3 - Near-zero variance features
Feature	freqRatio	percentUnique	zeroVar	nzv
Enclosed.Porch	109.941	7.286	FALSE	TRUE
Screen.Porch	166.917	4.690	FALSE	TRUE
Misc.Val	236.222	1.412	FALSE	TRUE
Low.Qual.Fin.SF	721.000	1.366	FALSE	TRUE
X3Ssn.Porch	724.000	0.956	FALSE	TRUE
Pool.Area	2187.000	0.455	FALSE	TRUE
Condition.2	197.364	0.364	FALSE	TRUE
Functional	38.491	0.364	FALSE	TRUE
Roof.Matl	135.438	0.319	FALSE	TRUE
BsmtFin.Type.2	24.312	0.273	FALSE	TRUE
Heating	98.091	0.273	FALSE	TRUE
Bsmt.Cond	23.747	0.228	FALSE	TRUE
Garage.Qual	22.318	0.228	FALSE	TRUE
Garage.Cond	36.944	0.228	FALSE	TRUE
Land.Contour	21.151	0.182	FALSE	TRUE
Kitchen.AbvGr	20.920	0.182	FALSE	TRUE
Utilities	1096.500	0.137	FALSE	TRUE
Land.Slope	20.624	0.137	FALSE	TRUE
Street	218.600	0.091	FALSE	TRUE

# Number of columns flagged as near-zero variance
cat("Total NZV features     :", sum(nzv_metrics$nzv), "\n")

## Total NZV features     : 19

# Predictor count (all columns except the response) minus the flagged columns
cat("Remaining after removal:", ncol(ames_train) - 1 - sum(nzv_metrics$nzv), "\n")

## Remaining after removal: 62

# Bar per feature (response excluded) of log(1 + frequency ratio), ordered by
# frequency ratio and coloured by the near-zero-variance flag
nzv_metrics |>
  tibble::rownames_to_column("Feature") |>
  filter(Feature != "Sale_Price") |>
  ggplot(aes(
    x    = reorder(Feature, freqRatio),
    y    = log1p(freqRatio),
    fill = nzv
  )) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    name   = "Near-zero\nvariance",
    values = c("TRUE" = "#C0392B", "FALSE" = "#4472C4")
  ) +
  labs(
    title    = "Figure 3.5 - log(1 + Frequency Ratio) per feature",
    subtitle = "Red = flagged as near-zero variance",
    x        = NULL,
    y        = "log(1 + Frequency Ratio)"
  ) +
  theme_bw(base_size = 8) +
  theme(axis.text.y = element_text(size = 6))

Figure 3.5 - Frequency ratio per feature

NZV removal via recipes

# Impute first, then filter: step_nzv() therefore evaluates the imputed
# columns rather than the raw ones that still contain NAs
ames_recipe_nzv <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_nzv(all_predictors())

# Train on the training partition and apply to the same partition
prep_nzv  <- ames_recipe_nzv |> prep(training = ames_train)
baked_nzv <- prep_nzv |> bake(new_data = ames_train)

# Predictor counts before and after the recipe; the "- 1" excludes the
# response column in both cases
cat("Features before NZV removal:", ncol(ames_train) - 1, "\n")

## Features before NZV removal: 81

# NOTE(review): 23 predictors are dropped here versus 19 flagged by
# nearZeroVar() on the raw data - presumably because this recipe imputes
# before filtering; confirm
cat("Features after  NZV removal:", ncol(baked_nzv)  - 1, "\n")

## Features after  NZV removal: 58

Correlation-based filtering

# Numeric columns of the imputed, NZV-filtered training data (response included)
num_baked <- select(baked_nzv, where(is.numeric))

# Names of the 20 numeric predictors with the largest absolute correlation
# with Sale_Price (the response itself is removed from the ranking)
top20_names <- num_baked %>%
  cor() %>%
  .[, "Sale_Price"] %>%
  abs() %>%
  sort(decreasing = TRUE) %>%
  names() %>%
  setdiff("Sale_Price") %>%
  head(20)

# Pairwise correlations among those 20 predictors
cor_mat <- num_baked %>%
  select(all_of(top20_names)) %>%
  cor()

# Columns caret suggests dropping at the 0.80 correlation cutoff
high_cor_idx <- caret::findCorrelation(cor_mat, cutoff = 0.80)
cat("Features to remove (|r| > 0.80):", length(high_cor_idx), "\n")

## Features to remove (|r| > 0.80): 2

# Lower-triangle colour heatmap of the top-20 correlation matrix, without the
# diagonal; the top margin is enlarged so the title fits
corrplot::corrplot(
  corr   = cor_mat,
  title  = "Figure 3.5b - Correlation: top-20 numeric predictors",
  method = "color",
  type   = "lower",
  diag   = FALSE,
  tl.col = "black",
  tl.cex = 0.75,
  mar    = c(0, 0, 2, 0)
)

Figure 3.5b - Correlation heatmap of top 20 numeric predictors

3.5 Numeric Feature Engineering

3.5.1 Skewness

Skewed predictors can violate model assumptions and hurt distance-based algorithms.

# Sample skewness of every numeric predictor (response excluded), one row per
# feature, ordered by absolute skewness with the largest first
skew_vals <- num_baked |>
  select(-Sale_Price) |>
  summarise(across(
    everything(),
    \(col) e1071::skewness(col, na.rm = TRUE)
  )) |>
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "Feature",
    values_to = "Skewness"
  ) |>
  arrange(desc(abs(Skewness)))

# Table 3.4: the 15 most skewed predictors
skew_vals |>
  head(15) |>
  knitr::kable(
    caption = "Table 3.4 - Top 15 most skewed numeric predictors",
    digits  = 3
  )

Table 3.4 - Top 15 most skewed numeric predictors
Feature	Skewness
Lot.Area	13.461
Bsmt.Half.Bath	4.168
BsmtFin.SF.2	4.045
Mas.Vnr.Area	2.673
Open.Porch.SF	2.609
Lot.Frontage	1.898
Wood.Deck.SF	1.837
BsmtFin.SF.1	1.455
X1st.Flr.SF	1.279
Gr.Liv.Area	1.216
Total.Bsmt.SF	1.058
Bsmt.Unf.SF	0.952
X2nd.Flr.SF	0.868
TotRms.AbvGrd	0.739
Half.Bath	0.729

# Names of the 10 most skewed predictors (skew_vals is already sorted)
top10_skew <- skew_vals %>%
  head(10) %>%
  pull(Feature)

# One histogram per selected feature, each panel on its own x and y scales
num_baked %>%
  select(all_of(top10_skew)) %>%
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "Feature",
    values_to = "Value"
  ) %>%
  ggplot(aes(Value)) +
  geom_histogram(bins = 40, fill = "#4472C4", colour = "white", linewidth = 0.2) +
  facet_wrap(~ Feature, ncol = 5, scales = "free") +
  labs(
    title    = "Figure 3.6 - Top-10 most skewed numeric predictors",
    subtitle = "Scales are free per panel",
    x        = NULL,
    y        = "Count"
  ) +
  theme_bw(base_size = 9) +
  theme(strip.text = element_text(size = 8, face = "bold"))

Figure 3.6 - Distributions of 10 most skewed features

Correcting skewness with Box-Cox

# skew_vals is sorted by absolute skewness (descending), so its first row is
# the most skewed predictor
most_skewed <- skew_vals$Feature[1]
cat("Most skewed feature:", most_skewed, "\n")

## Most skewed feature: Lot.Area

# "Before" panel: distribution of the most skewed predictor in the imputed,
# NZV-filtered data, with its skewness quoted in the panel title
p_before <- ggplot(num_baked, aes(x = .data[[most_skewed]])) +
  geom_histogram(bins = 50, fill = "#7F8C8D", colour = "white", linewidth = 0.2) +
  labs(
    title = paste0(
      "Before: ", most_skewed,
      " (skew = ",
      round(e1071::skewness(num_baked[[most_skewed]], na.rm = TRUE), 2),
      ")"
    ),
    x = most_skewed,
    y = "Count"
  ) +
  theme_bw(base_size = 11)

# Blueprint: median/mode imputation followed by Box-Cox on every numeric
# predictor
bc_pred_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_BoxCox(all_numeric_predictors())

# Train on the training partition and apply to the same partition
bc_prep  <- bc_pred_recipe |> prep(training = ames_train)
bc_baked <- bc_prep |> bake(new_data = ames_train)

# "After" panel: the same predictor taken from the Box-Cox-transformed data
p_after <- ggplot(bc_baked, aes(x = .data[[most_skewed]])) +
  geom_histogram(bins = 50, fill = "#C0392B", colour = "white", linewidth = 0.2) +
  labs(
    title = paste0(
      "After Box-Cox (skew = ",
      round(e1071::skewness(bc_baked[[most_skewed]], na.rm = TRUE), 2),
      ")"
    ),
    x = paste0("BoxCox(", most_skewed, ")"),
    y = "Count"
  ) +
  theme_bw(base_size = 11)

# Side-by-side comparison
(p_before + p_after) +
  plot_annotation(
    title = paste0("Figure 3.7 - Box-Cox transformation of ", most_skewed)
  )

Figure 3.7 - Box-Cox on most skewed predictor

Yeo-Johnson (handles zeros and negatives)

# Blueprint: median/mode imputation followed by Yeo-Johnson on every numeric
# predictor
yj_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_YeoJohnson(all_numeric_predictors())

# Train on the training partition and apply to the same partition
yj_prep  <- yj_recipe |> prep(training = ames_train)
yj_baked <- yj_prep |> bake(new_data = ames_train)

# The most skewed predictor after Yeo-Johnson, skewness quoted in the title
p_yj <- ggplot(yj_baked, aes(x = .data[[most_skewed]])) +
  geom_histogram(bins = 50, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  labs(
    title = paste0(
      "Yeo-Johnson (skew = ",
      round(e1071::skewness(yj_baked[[most_skewed]], na.rm = TRUE), 2),
      ")"
    ),
    x = paste0("YJ(", most_skewed, ")"),
    y = "Count"
  ) +
  theme_bw(base_size = 11)

# Raw vs. Box-Cox vs. Yeo-Johnson, in one row
(p_before + p_after + p_yj) +
  plot_annotation(
    title = paste0("Figure 3.7b - Box-Cox vs. Yeo-Johnson on ", most_skewed)
  )

Figure 3.7b - Yeo-Johnson vs Box-Cox comparison

3.5.2 Standardisation

Many ML algorithms require features on a comparable scale. Z-score standardisation: \(z_i = (x_i - \bar{x}) / s\)

# Four illustrative numeric features; keep only those that survived the NZV
# filter in baked_nzv
feats4 <- intersect(
  c("Lot.Area", "Gr.Liv.Area", "Year.Built", "Garage.Area"),
  names(baked_nzv)
)

# Histograms of those features on their raw scales, one free-scaled panel each
p_before_scale <- baked_nzv |>
  select(all_of(feats4)) |>
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "Feature",
    values_to = "Value"
  ) |>
  ggplot(aes(x = Value, fill = Feature)) +
  geom_histogram(bins = 40, colour = "white", linewidth = 0.2, show.legend = FALSE) +
  facet_wrap(~ Feature, ncol = 2, scales = "free") +
  labs(title = "Before standardisation", x = NULL, y = "Count") +
  theme_bw(base_size = 10)

# Blueprint: impute, drop near-zero-variance predictors, then centre and scale
# the numeric predictors (step_normalize = step_center + step_scale in one call)
norm_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_nzv(all_predictors()) |>
  step_normalize(all_numeric_predictors())

# Train on the training partition and apply to the same partition
norm_prep  <- norm_recipe |> prep(training = ames_train)
norm_baked <- norm_prep |> bake(new_data = ames_train)

# Keep only the illustrative features that are present in the normalised data
feats4_norm <- intersect(feats4, names(norm_baked))

# Histograms of the same features after z-score standardisation
p_after_scale <- norm_baked |>
  select(all_of(feats4_norm)) |>
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "Feature",
    values_to = "Value"
  ) |>
  ggplot(aes(x = Value, fill = Feature)) +
  geom_histogram(bins = 40, colour = "white", linewidth = 0.2, show.legend = FALSE) +
  facet_wrap(~ Feature, ncol = 2, scales = "free") +
  labs(title = "After standardisation (mean = 0, sd = 1)", x = NULL, y = "Count") +
  theme_bw(base_size = 10)

# patchwork "/" stacks the raw-scale panel above the standardised one
(p_before_scale / p_after_scale) +
  plot_annotation(
    subtitle = "Top: raw scale  |  Bottom: standardised",
    title    = "Figure 3.8 - Effect of z-score standardisation"
  )

Figure 3.8 - Before and after standardisation

# Table 3.5: mean and sd (rounded to 4 dp) of each illustrative feature after
# normalisation. across() names its outputs "<feature>_<stat>"; names_sep
# splits on the LAST underscore so the stat becomes its own column.
norm_baked %>%
  select(all_of(feats4_norm)) %>%
  summarise(across(
    everything(),
    list(
      mean = \(v) round(mean(v, na.rm = TRUE), 4),
      sd   = \(v) round(sd(v, na.rm = TRUE), 4)
    )
  )) %>%
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = c("Feature", ".value"),
    names_sep = "_(?=[^_]+$)"
  ) %>%
  knitr::kable(
    caption = "Table 3.5 - Means and SDs after normalisation (expect ~0 and ~1)"
  )

Table 3.5 - Means and SDs after normalisation (expect ~0 and ~1)
Feature	mean	sd
Lot.Area	0	1
Gr.Liv.Area	0	1
Year.Built	0	1
Garage.Area	0	1

# Stack long-format copies of the illustrative features on the raw and the
# z-scored scale, tagged by Stage, then draw violin + box plots per feature
bind_rows(
  baked_nzv |>
    select(all_of(feats4)) |>
    tidyr::pivot_longer(
      cols      = everything(),
      names_to  = "Feature",
      values_to = "Value"
    ) |>
    mutate(Stage = "Before"),
  norm_baked |>
    select(all_of(feats4_norm)) |>
    tidyr::pivot_longer(
      cols      = everything(),
      names_to  = "Feature",
      values_to = "Value"
    ) |>
    mutate(Stage = "After (z-score)")
) |>
  # Fix the level order so "Before" is drawn to the left of "After (z-score)"
  mutate(Stage = factor(Stage, levels = c("Before", "After (z-score)"))) |>
  ggplot(aes(x = Stage, y = Value, fill = Stage)) +
  geom_violin(trim = TRUE, alpha = 0.7) +
  geom_boxplot(width = 0.12, alpha = 0.5, outlier.size = 0.6) +
  facet_wrap(~ Feature, ncol = 2, scales = "free_y") +
  scale_fill_manual(values = c("Before" = "#4472C4", "After (z-score)" = "#27AE60")) +
  labs(
    title = "Figure 3.8b - Violin plots before & after standardisation",
    x     = NULL,
    y     = "Value",
    fill  = NULL
  ) +
  theme_bw(base_size = 11) +
  theme(
    strip.text      = element_text(face = "bold"),
    legend.position = "bottom"
  )

Figure 3.8b - Violin plots before and after standardisation

Summary

Summary - Key functions per section
Section	Topic	Key_Recipe_Steps
3.1	Prerequisites	initial_split()
3.2	Target engineering	step_log(), step_BoxCox(), step_YeoJohnson()
3.3	Dealing with missingness	step_impute_median/mode(), step_impute_knn(), step_impute_bag()
3.4	Feature filtering	step_nzv(), findCorrelation()
3.5	Numeric feature engineering	step_BoxCox(), step_YeoJohnson(), step_normalize()

How to publish to RPubs

After knitting this document to HTML in RStudio:

Click the Publish button (blue icon, top-right of preview window)
Select RPubs
Sign in at rpubs.com (free account)
Give it a title and click Publish

Session Info

# Record the R version, platform, and package versions used for this run
sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.1       scales_1.4.0      forecast_8.24.0   corrplot_0.95    
##  [5] e1071_1.7-17      naniar_1.1.0      patchwork_1.3.2   rsample_1.3.1    
##  [9] AmesHousing_0.0.4 recipes_1.3.1     caret_7.0-1       lattice_0.22-7   
## [13] visdat_0.6.0      ggplot2_4.0.0     dplyr_1.1.4      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.6          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] knitr_1.50           labeling_0.4.3       curl_7.0.0          
## [22] TTR_0.24.4           plyr_1.8.9           RColorBrewer_1.1-3  
## [25] withr_3.0.2          purrr_1.1.0          nnet_7.3-20         
## [28] grid_4.5.1           stats4_4.5.1         sparsevctrs_0.3.4   
## [31] xts_0.14.1           colorspace_2.1-2     future_1.67.0       
## [34] globals_0.18.0       iterators_1.0.14     MASS_7.3-65         
## [37] cli_3.6.5            UpSetR_1.4.0         rmarkdown_2.29      
## [40] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [43] reshape2_1.4.5       cachem_1.1.0         proxy_0.4-29        
## [46] stringr_1.5.2        splines_4.5.1        parallel_4.5.1      
## [49] urca_1.3-4           vctrs_0.6.5          hardhat_1.4.2       
## [52] Matrix_1.7-3         jsonlite_2.0.0       tseries_0.10-58     
## [55] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [58] jquerylib_0.1.4      quantmod_0.4.28      glue_1.8.0          
## [61] parallelly_1.45.1    codetools_0.2-20     lubridate_1.9.4     
## [64] stringi_1.8.7        gtable_0.3.6         quadprog_1.5-8      
## [67] lmtest_0.9-40        tibble_3.3.0         pillar_1.11.1       
## [70] furrr_0.3.1          htmltools_0.5.8.1    ipred_0.9-15        
## [73] lava_1.8.1           R6_2.6.1             evaluate_1.0.5      
## [76] fracdiff_1.5-3       bslib_0.9.0          class_7.3-23        
## [79] Rcpp_1.1.0           gridExtra_2.3        nlme_3.1-168        
## [82] prodlim_2025.04.28   xfun_0.52            zoo_1.8-14          
## [85] ModelMetrics_1.2.2.2 pkgconfig_2.0.3

Source: bradleyboehmke.github.io/HOML/engineering.html

HOML Chapter 3 – Feature & Target Engineering

Sections 3.1 – 3.5 | Hands-On Machine Learning with R

Reproduced from bradleyboehmke.github.io/HOML

2026-03-16