Overview

This document reproduces all code outputs from Chapter 3: Feature & Target Engineering (Sections 3.1 – 3.5) of the online book Hands-On Machine Learning with R by Bradley Boehmke & Brandon Greenwell.

Note on the dataset: AmesHousing::make_ames() returns an already-cleaned version of the data with no missing values left in it. To faithfully demonstrate Section 3.3, we instead use the raw Ames data (AmesHousing::ames_raw), which retains the real NAs, and split it into training and test sets.

Section Topic
3.1 Prerequisites
3.2 Target engineering
3.3 Dealing with missingness
3.4 Feature filtering
3.5 Numeric feature engineering

3.1 Prerequisites

# Helper packages
library(dplyr)       # data manipulation
library(ggplot2)     # graphics
library(visdat)      # missing value visualisation

# Feature engineering packages
library(caret)       # ML tasks
library(recipes)     # feature engineering blueprints

# Supporting packages
library(AmesHousing) # Ames housing dataset (raw + cleaned)
library(rsample)     # train/test splitting
library(patchwork)   # combining ggplots
library(naniar)      # missing data tools
library(e1071)       # skewness()
library(corrplot)    # correlation heatmap
library(forecast)    # BoxCox.lambda()
library(scales)      # dollar labels
library(tidyr)       # pivot_longer()
# Fix the RNG state first so the random split below is reproducible
set.seed(123)

# Start from the RAW Ames data so its missing values survive for Section 3.3:
# make the column names syntactic, rename the response to Sale_Price and
# append a price-quartile helper column used only for stratification
ames_raw <- AmesHousing::ames_raw |>
  rename_with(make.names) |>
  rename(Sale_Price = SalePrice) |>
  mutate(price_q = dplyr::ntile(Sale_Price, 4))

# Stratified 75/25 split; the helper column is dropped from both partitions
split      <- rsample::initial_split(ames_raw, prop = 0.75, strata = "price_q")
ames_train <- select(rsample::training(split), -price_q)
ames_test  <- select(rsample::testing(split), -price_q)

# Report the size of each partition and the amount of missing training data
cat("Training rows :", nrow(ames_train), "\n")
## Training rows : 2196
cat("Test rows     :", nrow(ames_test),  "\n")
## Test rows     : 734
# All columns except one (the response, Sale_Price) are counted as features
cat("Features      :", ncol(ames_train) - 1, "\n")
## Features      : 81
# Total number of NA cells over every column of the training set
cat("Missing cells (train):", sum(is.na(ames_train)), "\n")
## Missing cells (train): 10548

3.2 Target Engineering

Although not always required, transforming the response variable can lead to predictive improvement, especially with parametric models that assume normally distributed errors.

3.2.1 Visualising the skewed response

# Left panel: the response on its original dollar scale
p1 <- ames_train |>
  ggplot(aes(x = Sale_Price)) +
  geom_histogram(bins = 70, fill = "#4472C4", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 12) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Original Sale Price", x = "Sale Price ($)", y = "Count")

# Right panel: the same response after a natural-log transform
# (computed as a column first; the axis label is set explicitly below)
p2 <- ames_train |>
  mutate(log_price = log(Sale_Price)) |>
  ggplot(aes(x = log_price)) +
  geom_histogram(bins = 70, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 12) +
  labs(title = "log(Sale Price)", x = "log(Sale Price)", y = "Count")

# Place both panels side by side (patchwork) under a shared title
(p1 + p2) +
  plot_annotation(
    title    = "Figure 3.1 - Transforming the response variable",
    subtitle = "Right-skewed original vs. approximately normal log-transform"
  )
Figure 3.1 - Right-skewed Sale_Price distribution

Figure 3.1 - Right-skewed Sale_Price distribution

3.2.2 Option 1 - Log transformation

# Natural log of the training response
transformed_response <- log(ames_train[["Sale_Price"]])

# Sample skewness before and after the transform, rounded to 3 decimals
skew_raw <- round(e1071::skewness(ames_train[["Sale_Price"]]), 3)
cat("Skewness (original)       :", skew_raw, "\n")
## Skewness (original)       : 1.602
skew_log <- round(e1071::skewness(transformed_response), 3)
cat("Skewness (log-transformed):", skew_log, "\n")
## Skewness (log-transformed): -0.126
# Recipe blueprint: declares the log step on the outcome once, so the same
# transformation can be applied consistently to train and test
ames_recipe_log <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_log(all_outcomes())

ames_recipe_log

Handling zeros and small negative values (greater than -1) with log1p

log(-0.5)       # NaN (with a warning) - log() is undefined for negative input
## [1] NaN
log1p(-0.5)     # log(1 + x): finite only when x > -1 (log1p(-1) is -Inf)
## [1] -0.6931472

3.2.3 Option 2 - Box-Cox transformation

# Estimate lambda from the TRAINING response only, so that nothing from the
# test set leaks into the transformation
lambda_bc <- with(
  ames_train,
  forecast::BoxCox.lambda(Sale_Price, method = "loglik")
)
cat("Optimal Box-Cox lambda:", round(lambda_bc, 4), "\n")
## Optimal Box-Cox lambda: 0.05
# Apply the fitted transformation to the training response
bc_train <- forecast::BoxCox(x = ames_train$Sale_Price, lambda = lambda_bc)

# Panel 1: untransformed response
p_orig <- ames_train |>
  ggplot(aes(x = Sale_Price)) +
  geom_histogram(bins = 70, fill = "#7F8C8D", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Original", x = "Sale Price ($)", y = "Count")

# Panel 2: log-transformed response, supplied as a data-frame column rather
# than a `$` expression inside aes()
p_log <- data.frame(value = log(ames_train$Sale_Price)) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 70, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(title = "log(Sale Price)", x = "log(Sale Price)", y = "Count")

# Panel 3: Box-Cox-transformed response; the title shows the rounded lambda
p_bc <- data.frame(value = as.numeric(bc_train)) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 70, fill = "#C0392B", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(
    title = paste0("Box-Cox (lambda = ", round(lambda_bc, 2), ")"),
    x = "Box-Cox(Sale Price)",
    y = "Count"
  )

# Three panels side by side under a shared title
(p_orig + p_log + p_bc) +
  plot_annotation(
    title    = "Figure 3.2 - Comparing transformations of Sale_Price",
    subtitle = "Box-Cox finds the optimal power transformation automatically"
  )
Figure 3.2 - Box-Cox transformation of Sale Price

Figure 3.2 - Box-Cox transformation of Sale Price

# One row of three base-graphics panels with tightened margins
par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))

# Draw a normal Q-Q plot of `y` and overlay a red reference line
draw_qq <- function(y, main, col) {
  qqnorm(y, main = main, col = col, pch = 16, cex = 0.4)
  qqline(y, col = "red", lwd = 2)
}

# Original, log-transformed and Box-Cox-transformed training response
draw_qq(ames_train$Sale_Price, "QQ - Original", "#7F8C8D")
draw_qq(log(ames_train$Sale_Price), "QQ - log", "#27AE60")
draw_qq(
  bc_train,
  paste0("QQ - Box-Cox (lam=", round(lambda_bc, 2), ")"),
  "#C0392B"
)
Figure 3.2b - Normal Q-Q plots

Figure 3.2b - Normal Q-Q plots

# Return to a single-panel layout AND to the default margins: the Q-Q plot
# setup changed both `mfrow` and `mar`, and a changed `mar` would otherwise
# persist into later base-graphics plots when this is run as a script
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)
# Recipe version - no lambda is passed in; the step estimates it from the
# training data when the recipe is prepped
ames_recipe_bc <- step_BoxCox(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

ames_recipe_bc

Skewness summary table

# Table 3.1: skewness of the training response under each transformation,
# rounded to 4 decimals
knitr::kable(
  data.frame(
    Transformation = c(
      "Original",
      "log(Sale Price)",
      paste0("Box-Cox (lambda = ", round(lambda_bc, 3), ")")
    ),
    Skewness = round(
      vapply(
        list(ames_train$Sale_Price, log(ames_train$Sale_Price), bc_train),
        e1071::skewness,
        numeric(1)
      ),
      4
    )
  ),
  caption = "Table 3.1 - Skewness after each transformation"
)
Table 3.1 - Skewness after each transformation
Transformation Skewness
Original 1.6016
log(Sale Price) -0.1256
Box-Cox (lambda = 0.05) -0.0165

3.3 Dealing with Missingness

3.3.1 Visualising missing values

# Percentage of NA cells per column, restricted to columns that have at
# least one NA and ordered from most to least missing
miss_summary <- tibble::enframe(
  colSums(is.na(ames_train)) / nrow(ames_train) * 100,
  name  = "Feature",
  value = "Pct_Missing"
) |>
  filter(Pct_Missing > 0) |>
  arrange(desc(Pct_Missing))

cat("Features with missing values:", nrow(miss_summary), "\n")
## Features with missing values: 26
cat("Total missing cells         :", sum(is.na(ames_train)), "\n")
## Total missing cells         : 10548
# Show the 15 worst-affected features
miss_summary |>
  head(15) |>
  knitr::kable(
    digits  = 2,
    caption = "Table 3.2 - Top 15 features by % missing (training set)"
  )
Table 3.2 - Top 15 features by % missing (training set)
Feature Pct_Missing
Pool.QC 99.59
Misc.Feature 96.72
Alley 93.17
Fence 80.92
Fireplace.Qu 48.82
Lot.Frontage 16.99
Garage.Yr.Blt 5.56
Garage.Finish 5.56
Garage.Qual 5.56
Garage.Cond 5.56
Garage.Type 5.46
Bsmt.Exposure 2.96
Bsmt.Qual 2.82
Bsmt.Cond 2.82
BsmtFin.Type.1 2.82
# Figure 3.3: per-cell data type and missingness; sorting of columns by type
# is switched off
ames_train |>
  visdat::vis_dat(sort_type = FALSE) +
  labs(title = "Figure 3.3 - Data types and missingness in ames_train")
Figure 3.3 - vis_dat overview of training data

Figure 3.3 - vis_dat overview of training data

# Figure 3.3b: missing-value map with clustering enabled
ames_train |>
  visdat::vis_miss(cluster = TRUE) +
  labs(title = "Figure 3.3b - Clustered missing-value pattern")
Figure 3.3b - vis_miss clustered plot

Figure 3.3b - vis_miss clustered plot

# gg_miss_upset is given only the columns that actually contain NAs
miss_vars <- miss_summary$Feature[miss_summary$Pct_Missing > 0]

# Guard first: fewer than two such columns gets a message instead of a plot
if (length(miss_vars) < 2) {
  cat("Not enough variables with missing data for an UpSet plot.\n")
} else {
  naniar::gg_miss_upset(
    ames_train[miss_vars],
    nsets = min(10, length(miss_vars))  # show at most 10 variables
  )
}
Figure 3.3c - UpSet plot of co-occurring missing variables

Figure 3.3c - UpSet plot of co-occurring missing variables

# Figure 3.3d: horizontal bars of % missing, ordered by % missing, with
# dashed reference lines at 5% and 20%
ggplot(
  miss_summary,
  aes(x = reorder(Feature, Pct_Missing), y = Pct_Missing)
) +
  geom_col(fill = "#4472C4") +
  geom_hline(yintercept = 5, linetype = "dashed", colour = "orange", linewidth = 0.7) +
  geom_hline(yintercept = 20, linetype = "dashed", colour = "red", linewidth = 0.7) +
  coord_flip() +
  theme_bw(base_size = 11) +
  labs(
    title    = "Figure 3.3d - % Missing per feature (training set)",
    subtitle = "Dashed lines at 5% (orange) and 20% (red)",
    x        = NULL,
    y        = "Missing (%)"
  )
Figure 3.3d - % missing per feature bar chart

Figure 3.3d - % missing per feature bar chart

3.3.2 Imputation

3.3.2.1 Estimated statistic (median / mode)

# Blueprint: median imputation for numeric predictors, mode imputation for
# nominal predictors
ames_recipe_imp_stat <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors())

ames_recipe_imp_stat
# Estimate the imputation values on the training set, then apply them to it
prep_stat  <- ames_recipe_imp_stat |> prep(training = ames_train)
baked_stat <- prep_stat |> bake(new_data = ames_train)

# Lot.Frontage as observed (rows with NA dropped) ...
p_orig_lf <- ames_train |>
  filter(!is.na(Lot.Frontage)) |>
  ggplot(aes(x = Lot.Frontage)) +
  geom_histogram(bins = 40, fill = "#7F8C8D", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(title = "Original (non-missing only)", x = "Lot Frontage (ft)", y = "Count")

# ... versus Lot.Frontage after median imputation
p_med_lf <- baked_stat |>
  ggplot(aes(x = Lot.Frontage)) +
  geom_histogram(bins = 40, fill = "#4472C4", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(title = "After median imputation", x = "Lot Frontage (ft)", y = "Count")

(p_orig_lf + p_med_lf) +
  plot_annotation(title = "Figure 3.4a - Lot.Frontage: original vs. median-imputed")
Figure 3.4a - Median imputation for Lot.Frontage

Figure 3.4a - Median imputation for Lot.Frontage

3.3.2.2 K-Nearest Neighbour imputation

# Blueprint: impute every predictor from its 6 nearest neighbours
ames_recipe_knn <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_knn(all_predictors(), neighbors = 6)

ames_recipe_knn
# Fit on the training set and apply to the same data
prep_knn  <- ames_recipe_knn |> prep(training = ames_train)
baked_knn <- prep_knn |> bake(new_data = ames_train)

# Lot.Frontage after KNN imputation
p_knn_lf <- baked_knn |>
  ggplot(aes(x = Lot.Frontage)) +
  geom_histogram(bins = 40, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(title = "After KNN imputation (k = 6)", x = "Lot Frontage (ft)", y = "Count")

# Observed vs. median-imputed vs. KNN-imputed, side by side
(p_orig_lf + p_med_lf + p_knn_lf) +
  plot_annotation(title = "Figure 3.4b - Imputation methods compared for Lot.Frontage")
Figure 3.4b - KNN vs median imputation for Lot.Frontage

Figure 3.4b - KNN vs median imputation for Lot.Frontage

3.3.2.3 Tree-based imputation (bagged trees)

# Blueprint only (never prepped here): bagged-tree imputation of every
# predictor using 25 trees
ames_recipe_bag <- step_impute_bag(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors(),
  trees = 25
)

ames_recipe_bag

3.4 Feature Filtering

Removing features with near-zero variance (NZV) reduces noise and computational cost.

Near-zero variance detection

# Per-column frequency ratio, % unique values and zero/near-zero variance flags
nzv_metrics <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)

# Table 3.3: flagged columns only, most diverse first, capped at 20 rows
nzv_metrics |>
  tibble::rownames_to_column("Feature") |>
  filter(nzv) |>
  arrange(desc(percentUnique)) |>
  head(20) |>
  knitr::kable(
    digits  = 3,
    caption = "Table 3.3 - Near-zero variance features"
  )
Table 3.3 - Near-zero variance features
Feature freqRatio percentUnique zeroVar nzv
Enclosed.Porch 109.941 7.286 FALSE TRUE
Screen.Porch 166.917 4.690 FALSE TRUE
Misc.Val 236.222 1.412 FALSE TRUE
Low.Qual.Fin.SF 721.000 1.366 FALSE TRUE
X3Ssn.Porch 724.000 0.956 FALSE TRUE
Pool.Area 2187.000 0.455 FALSE TRUE
Condition.2 197.364 0.364 FALSE TRUE
Functional 38.491 0.364 FALSE TRUE
Roof.Matl 135.438 0.319 FALSE TRUE
BsmtFin.Type.2 24.312 0.273 FALSE TRUE
Heating 98.091 0.273 FALSE TRUE
Bsmt.Cond 23.747 0.228 FALSE TRUE
Garage.Qual 22.318 0.228 FALSE TRUE
Garage.Cond 36.944 0.228 FALSE TRUE
Land.Contour 21.151 0.182 FALSE TRUE
Kitchen.AbvGr 20.920 0.182 FALSE TRUE
Utilities 1096.500 0.137 FALSE TRUE
Land.Slope 20.624 0.137 FALSE TRUE
Street 218.600 0.091 FALSE TRUE
# Number of columns flagged as near-zero variance
n_nzv <- sum(nzv_metrics$nzv)
cat("Total NZV features     :", n_nzv, "\n")
## Total NZV features     : 19
# Feature count (all columns but the response) minus the flagged ones
cat("Remaining after removal:", ncol(ames_train) - 1 - n_nzv, "\n")
## Remaining after removal: 62
# Figure 3.5: log(1 + frequency ratio) for every predictor, coloured by flag
nzv_metrics |>
  tibble::rownames_to_column("Feature") |>
  filter(Feature != "Sale_Price") |>
  ggplot(
    aes(x = reorder(Feature, freqRatio), y = log1p(freqRatio), fill = nzv)
  ) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    name   = "Near-zero\nvariance",
    values = c("TRUE" = "#C0392B", "FALSE" = "#4472C4")
  ) +
  labs(
    title    = "Figure 3.5 - log(1 + Frequency Ratio) per feature",
    subtitle = "Red = flagged as near-zero variance",
    x        = NULL,
    y        = "log(1 + Frequency Ratio)"
  ) +
  # small fonts: one axis label per feature
  theme_bw(base_size = 8) +
  theme(axis.text.y = element_text(size = 6))
Figure 3.5 - Frequency ratio per feature

Figure 3.5 - Frequency ratio per feature

NZV removal via recipes

# Blueprint: median/mode imputation followed by removal of
# near-zero-variance predictors
ames_recipe_nzv <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_nzv(all_predictors())

# Fit on the training set and apply to the same data
prep_nzv  <- ames_recipe_nzv |> prep(training = ames_train)
baked_nzv <- prep_nzv |> bake(new_data = ames_train)

# Feature counts exclude one column (the response)
n_feat_before <- ncol(ames_train) - 1
cat("Features before NZV removal:", n_feat_before, "\n")
## Features before NZV removal: 81
n_feat_after <- ncol(baked_nzv) - 1
cat("Features after  NZV removal:", n_feat_after, "\n")
## Features after  NZV removal: 58

Correlation-based filtering

# Numeric columns of the imputed, NZV-filtered training data
num_baked <- select(baked_nzv, where(is.numeric))

# Rank columns by absolute correlation with Sale_Price, drop the response
# itself, and keep the 20 strongest
abs_cor_price <- abs(cor(num_baked)[, "Sale_Price"])
ranked_names  <- names(sort(abs_cor_price, decreasing = TRUE))
top20_names   <- head(setdiff(ranked_names, "Sale_Price"), 20)

# Pairwise correlations among those 20 predictors
cor_mat <- cor(num_baked[top20_names])

# Column indices findCorrelation suggests removing at |r| > 0.80
high_cor_idx <- caret::findCorrelation(cor_mat, cutoff = 0.80)
cat("Features to remove (|r| > 0.80):", length(high_cor_idx), "\n")
## Features to remove (|r| > 0.80): 2
# Lower-triangle heatmap without the diagonal
corrplot::corrplot(
  cor_mat,
  method = "color",
  type   = "lower",
  diag   = FALSE,
  tl.cex = 0.75,
  tl.col = "black",
  mar    = c(0, 0, 2, 0),
  title  = "Figure 3.5b - Correlation: top-20 numeric predictors"
)
Figure 3.5b - Correlation heatmap of top 20 numeric predictors

Figure 3.5b - Correlation heatmap of top 20 numeric predictors


3.5 Numeric Feature Engineering

3.5.1 Skewness

Skewed predictors can violate model assumptions and hurt distance-based algorithms.

# Skewness of every numeric predictor (response excluded), ordered by
# absolute skewness, largest first
skew_vals <- num_baked |>
  select(-Sale_Price) |>
  vapply(e1071::skewness, numeric(1), na.rm = TRUE) |>
  tibble::enframe(name = "Feature", value = "Skewness") |>
  arrange(desc(abs(Skewness)))

# Table 3.4: the 15 most skewed predictors
skew_vals |>
  head(15) |>
  knitr::kable(
    digits  = 3,
    caption = "Table 3.4 - Top 15 most skewed numeric predictors"
  )
Table 3.4 - Top 15 most skewed numeric predictors
Feature Skewness
Lot.Area 13.461
Bsmt.Half.Bath 4.168
BsmtFin.SF.2 4.045
Mas.Vnr.Area 2.673
Open.Porch.SF 2.609
Lot.Frontage 1.898
Wood.Deck.SF 1.837
BsmtFin.SF.1 1.455
X1st.Flr.SF 1.279
Gr.Liv.Area 1.216
Total.Bsmt.SF 1.058
Bsmt.Unf.SF 0.952
X2nd.Flr.SF 0.868
TotRms.AbvGrd 0.739
Half.Bath 0.729
# Names of the ten most skewed predictors
top10_skew <- head(skew_vals$Feature, 10)

# Figure 3.6: one histogram per predictor, each panel on its own scales
num_baked[top10_skew] |>
  tidyr::pivot_longer(everything(), names_to = "Feature", values_to = "Value") |>
  ggplot(aes(x = Value)) +
  geom_histogram(bins = 40, fill = "#4472C4", colour = "white", linewidth = 0.2) +
  facet_wrap(~ Feature, scales = "free", ncol = 5) +
  labs(
    title    = "Figure 3.6 - Top-10 most skewed numeric predictors",
    subtitle = "Scales are free per panel",
    x        = NULL,
    y        = "Count"
  ) +
  theme_bw(base_size = 9) +
  theme(strip.text = element_text(size = 8, face = "bold"))
Figure 3.6 - Distributions of 10 most skewed features

Figure 3.6 - Distributions of 10 most skewed features

Correcting skewness with Box-Cox

# The predictor with the largest absolute skewness (first row of skew_vals)
most_skewed <- skew_vals$Feature[1]
cat("Most skewed feature:", most_skewed, "\n")
## Most skewed feature: Lot.Area
# Skewness of that predictor before any power transform, for the panel title
skew_before <- round(
  e1071::skewness(num_baked[[most_skewed]], na.rm = TRUE),
  2
)

p_before <- num_baked |>
  ggplot(aes(x = .data[[most_skewed]])) +
  geom_histogram(bins = 50, fill = "#7F8C8D", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(
    title = paste0("Before: ", most_skewed, " (skew = ", skew_before, ")"),
    x = most_skewed,
    y = "Count"
  )

# Blueprint: impute, then Box-Cox every numeric predictor
bc_pred_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_BoxCox(all_numeric_predictors())

bc_prep  <- bc_pred_recipe |> prep(training = ames_train)
bc_baked <- bc_prep |> bake(new_data = ames_train)

# Skewness of the same predictor after the Box-Cox step
skew_after_bc <- round(
  e1071::skewness(bc_baked[[most_skewed]], na.rm = TRUE),
  2
)

p_after <- bc_baked |>
  ggplot(aes(x = .data[[most_skewed]])) +
  geom_histogram(bins = 50, fill = "#C0392B", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(
    title = paste0("After Box-Cox (skew = ", skew_after_bc, ")"),
    x = paste0("BoxCox(", most_skewed, ")"),
    y = "Count"
  )

(p_before + p_after) +
  plot_annotation(
    title = paste0("Figure 3.7 - Box-Cox transformation of ", most_skewed)
  )
Figure 3.7 - Box-Cox on most skewed predictor

Figure 3.7 - Box-Cox on most skewed predictor

Yeo-Johnson (handles zeros and negatives)

# Blueprint: impute, then Yeo-Johnson every numeric predictor
yj_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_YeoJohnson(all_numeric_predictors())

yj_prep  <- yj_recipe |> prep(training = ames_train)
yj_baked <- yj_prep |> bake(new_data = ames_train)

# Skewness of the most skewed predictor after Yeo-Johnson, for the title
skew_after_yj <- round(
  e1071::skewness(yj_baked[[most_skewed]], na.rm = TRUE),
  2
)

p_yj <- yj_baked |>
  ggplot(aes(x = .data[[most_skewed]])) +
  geom_histogram(bins = 50, fill = "#27AE60", colour = "white", linewidth = 0.2) +
  theme_bw(base_size = 11) +
  labs(
    title = paste0("Yeo-Johnson (skew = ", skew_after_yj, ")"),
    x = paste0("YJ(", most_skewed, ")"),
    y = "Count"
  )

# Untransformed vs. Box-Cox vs. Yeo-Johnson, side by side
(p_before + p_after + p_yj) +
  plot_annotation(
    title = paste0("Figure 3.7b - Box-Cox vs. Yeo-Johnson on ", most_skewed)
  )
Figure 3.7b - Yeo-Johnson vs Box-Cox comparison

Figure 3.7b - Yeo-Johnson vs Box-Cox comparison

3.5.2 Standardisation

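Many ML algorithms require features on a comparable scale. Z-score standardisation rescales each feature to mean 0 and standard deviation 1: \(z_i = (x_i - \bar{x}) / s\), where \(\bar{x}\) and \(s\) are the feature's mean and standard deviation estimated on the training data.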

# Four example features, restricted to those still present after NZV removal
feats4 <- intersect(
  c("Lot.Area", "Gr.Liv.Area", "Year.Built", "Garage.Area"),
  names(baked_nzv)
)

# Top row of the figure: the features on their raw scales
p_before_scale <- baked_nzv[feats4] |>
  tidyr::pivot_longer(everything(), names_to = "Feature", values_to = "Value") |>
  ggplot(aes(x = Value, fill = Feature)) +
  geom_histogram(bins = 40, colour = "white", linewidth = 0.2, show.legend = FALSE) +
  facet_wrap(~ Feature, scales = "free", ncol = 2) +
  theme_bw(base_size = 10) +
  labs(title = "Before standardisation", x = NULL, y = "Count")

# Blueprint: impute, drop NZV predictors, then centre and scale the numeric
# predictors (step_normalize = step_center + step_scale in one call)
norm_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_nzv(all_predictors()) |>
  step_normalize(all_numeric_predictors())

norm_prep  <- norm_recipe |> prep(training = ames_train)
norm_baked <- norm_prep |> bake(new_data = ames_train)

# Keep only the example features that exist in the normalised data
feats4_norm <- intersect(feats4, names(norm_baked))

# Bottom row of the figure: the same features after normalisation
p_after_scale <- norm_baked[feats4_norm] |>
  tidyr::pivot_longer(everything(), names_to = "Feature", values_to = "Value") |>
  ggplot(aes(x = Value, fill = Feature)) +
  geom_histogram(bins = 40, colour = "white", linewidth = 0.2, show.legend = FALSE) +
  facet_wrap(~ Feature, scales = "free", ncol = 2) +
  theme_bw(base_size = 10) +
  labs(title = "After standardisation (mean = 0, sd = 1)", x = NULL, y = "Count")

# Stack the two rows (patchwork `/`) under a shared title
(p_before_scale / p_after_scale) +
  plot_annotation(
    title    = "Figure 3.8 - Effect of z-score standardisation",
    subtitle = "Top: raw scale  |  Bottom: standardised"
  )
Figure 3.8 - Before and after standardisation

Figure 3.8 - Before and after standardisation

# Table 3.5: mean and sd of each example feature after normalisation,
# each rounded to 4 decimals
norm_check <- norm_baked[feats4_norm] |>
  summarise(
    across(
      everything(),
      list(
        mean = \(col) round(mean(col, na.rm = TRUE), 4),
        sd   = \(col) round(sd(col, na.rm = TRUE), 4)
      )
    )
  ) |>
  # Columns are named <feature>_<stat>; split at the LAST underscore so the
  # statistic becomes its own column (".value") and the rest is the feature
  tidyr::pivot_longer(
    everything(),
    names_to  = c("Feature", ".value"),
    names_sep = "_(?=[^_]+$)"
  )

knitr::kable(
  norm_check,
  caption = "Table 3.5 - Means and SDs after normalisation (expect ~0 and ~1)"
)
Table 3.5 - Means and SDs after normalisation (expect ~0 and ~1)
Feature mean sd
Lot.Area 0 1
Gr.Liv.Area 0 1
Year.Built 0 1
Garage.Area 0 1
# Reshape the chosen columns of `df` to long form: one row per
# (Feature, Value) pair
to_long <- function(df, cols) {
  tidyr::pivot_longer(
    df[cols],
    everything(),
    names_to  = "Feature",
    values_to = "Value"
  )
}

# Figure 3.8b: raw vs. standardised values of each example feature; the
# argument names below become the values of the Stage column
bind_rows(
  "Before"          = to_long(baked_nzv, feats4),
  "After (z-score)" = to_long(norm_baked, feats4_norm),
  .id = "Stage"
) |>
  # fix the order so "Before" is drawn first
  mutate(Stage = factor(Stage, levels = c("Before", "After (z-score)"))) |>
  ggplot(aes(x = Stage, y = Value, fill = Stage)) +
  geom_violin(alpha = 0.7, trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.size = 0.6, alpha = 0.5) +
  facet_wrap(~ Feature, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = c("Before" = "#4472C4", "After (z-score)" = "#27AE60")) +
  labs(
    title = "Figure 3.8b - Violin plots before & after standardisation",
    x     = NULL,
    y     = "Value",
    fill  = NULL
  ) +
  theme_bw(base_size = 11) +
  theme(
    legend.position = "bottom",
    strip.text      = element_text(face = "bold")
  )
Figure 3.8b - Violin plots before and after standardisation

Figure 3.8b - Violin plots before and after standardisation


Summary

Summary - Key functions per section
Section Topic Key_Recipe_Steps
3.1 Prerequisites initial_split()
3.2 Target engineering step_log(), step_BoxCox(), step_YeoJohnson()
3.3 Dealing with missingness step_impute_median/mode(), step_impute_knn(), step_impute_bag()
3.4 Feature filtering step_nzv(), findCorrelation()
3.5 Numeric feature engineering step_BoxCox(), step_YeoJohnson(), step_normalize()

How to publish to RPubs

After knitting this document to HTML in RStudio:

  1. Click the Publish button (blue icon, top-right of preview window)
  2. Select RPubs
  3. Sign in at rpubs.com (free account)
  4. Give it a title and click Publish

Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.1       scales_1.4.0      forecast_8.24.0   corrplot_0.95    
##  [5] e1071_1.7-17      naniar_1.1.0      patchwork_1.3.2   rsample_1.3.1    
##  [9] AmesHousing_0.0.4 recipes_1.3.1     caret_7.0-1       lattice_0.22-7   
## [13] visdat_0.6.0      ggplot2_4.0.0     dplyr_1.1.4      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.6          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] knitr_1.50           labeling_0.4.3       curl_7.0.0          
## [22] TTR_0.24.4           plyr_1.8.9           RColorBrewer_1.1-3  
## [25] withr_3.0.2          purrr_1.1.0          nnet_7.3-20         
## [28] grid_4.5.1           stats4_4.5.1         sparsevctrs_0.3.4   
## [31] xts_0.14.1           colorspace_2.1-2     future_1.67.0       
## [34] globals_0.18.0       iterators_1.0.14     MASS_7.3-65         
## [37] cli_3.6.5            UpSetR_1.4.0         rmarkdown_2.29      
## [40] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [43] reshape2_1.4.5       cachem_1.1.0         proxy_0.4-29        
## [46] stringr_1.5.2        splines_4.5.1        parallel_4.5.1      
## [49] urca_1.3-4           vctrs_0.6.5          hardhat_1.4.2       
## [52] Matrix_1.7-3         jsonlite_2.0.0       tseries_0.10-58     
## [55] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [58] jquerylib_0.1.4      quantmod_0.4.28      glue_1.8.0          
## [61] parallelly_1.45.1    codetools_0.2-20     lubridate_1.9.4     
## [64] stringi_1.8.7        gtable_0.3.6         quadprog_1.5-8      
## [67] lmtest_0.9-40        tibble_3.3.0         pillar_1.11.1       
## [70] furrr_0.3.1          htmltools_0.5.8.1    ipred_0.9-15        
## [73] lava_1.8.1           R6_2.6.1             evaluate_1.0.5      
## [76] fracdiff_1.5-3       bslib_0.9.0          class_7.3-23        
## [79] Rcpp_1.1.0           gridExtra_2.3        nlme_3.1-168        
## [82] prodlim_2025.04.28   xfun_0.52            zoo_1.8-14          
## [85] ModelMetrics_1.2.2.2 pkgconfig_2.0.3

Source: bradleyboehmke.github.io/HOML/engineering.html