Reference: Boehmke, B. & Greenwell, B. (2020). Hands-On Machine Learning with R.
Chapter 3: Feature & Target Engineering.
Online: https://bradleyboehmke.github.io/HOML/engineering.html
pkgs <- c("dplyr", "ggplot2", "recipes", "rsample",
"AmesHousing", "forecast", "broom",
"purrr", "tidyr", "scales", "tibble")
install.packages(setdiff(pkgs, rownames(installed.packages())))# Core helpers
library(dplyr)
library(ggplot2)
library(tibble)
library(tidyr)
library(purrr)
library(scales)
# ML / feature engineering
library(AmesHousing) # required for make_ames() and ames_raw
library(rsample)
library(recipes)
library(broom)
library(forecast)
ggplot2::theme_set(ggplot2::theme_light())ames <- AmesHousing::make_ames()
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
cat("Training rows:", nrow(ames_train), "\n")## Training rows: 2049
## Test rows: 881
models <- c(
"Non-log transformed model residuals",
"Log transformed model residuals"
)
list(
m1 = lm(Sale_Price ~ Year_Built, data = ames_train),
m2 = lm(log(Sale_Price) ~ Year_Built, data = ames_train)
) %>%
map2_dfr(models, ~ augment(.x) %>% mutate(model = .y)) %>%
ggplot(aes(.resid)) +
geom_histogram(bins = 75) +
facet_wrap(~ model, scales = "free_x") +
ylab(NULL) +
xlab("Residuals") +
ggtitle("Figure 3.1: Residuals before and after log-transforming Sale_Price")# Manual log transformation
transformed_response <- log(ames_train$Sale_Price)
head(transformed_response)## [1] 11.56647 11.38509 11.69525 11.73607 11.11988 11.62625
Why log1p? When values can equal zero,
log(0) = -Inf. log1p() computes
log(1 + x) and is safe for x > -1:
## [1] NaN
## [1] -0.6931472
The Box-Cox family of transformations generalises the log:
\[y(\lambda) = \begin{cases} \frac{Y^\lambda - 1}{\lambda}, & \lambda \neq 0 \\ \log(Y), & \lambda = 0 \end{cases}\]
# Log transformation of training target
train_log_y <- log(ames_train$Sale_Price)
# Box-Cox: estimate lambda from training data, apply to both sets
lambda <- BoxCox.lambda(ames_train$Sale_Price)
cat("Estimated lambda:", round(lambda, 4), "\n")## Estimated lambda: 0.1387
train_bc_y <- BoxCox(ames_train$Sale_Price, lambda)
# Compare distributions
levs <- c("Normal", "Log_Transform", "BoxCox_Transform")
data.frame(
Normal = ames_train$Sale_Price,
Log_Transform = train_log_y,
BoxCox_Transform = train_bc_y
) %>%
gather(Transform, Value) %>%
mutate(Transform = factor(Transform, levels = levs)) %>%
ggplot(aes(Value, fill = Transform)) +
geom_histogram(show.legend = FALSE, bins = 40) +
facet_wrap(~ Transform, scales = "free_x") +
ggtitle("Figure 3.2: Normal, Log, and Box-Cox transformed Sale_Price")ames_raw <- AmesHousing::ames_raw
cat("Total missing values in raw Ames data:", sum(is.na(ames_raw)), "\n")## Total missing values in raw Ames data: 13997
Figure 3.3 – Missing-value heatmap (reproduced
without visdat):
# Build a tidy missingness matrix
miss_df <- ames_raw %>%
summarise(across(everything(), ~ mean(is.na(.)))) %>%
pivot_longer(everything(),
names_to = "Variable",
values_to = "PctMissing") %>%
filter(PctMissing > 0) %>%
arrange(desc(PctMissing))
miss_df## # A tibble: 27 × 2
## Variable PctMissing
## <chr> <dbl>
## 1 Pool QC 0.996
## 2 Misc Feature 0.964
## 3 Alley 0.932
## 4 Fence 0.805
## 5 Fireplace Qu 0.485
## 6 Lot Frontage 0.167
## 7 Garage Yr Blt 0.0543
## 8 Garage Finish 0.0543
## 9 Garage Qual 0.0543
## 10 Garage Cond 0.0543
## # ℹ 17 more rows
miss_df %>%
ggplot(aes(x = reorder(Variable, PctMissing), y = PctMissing)) +
geom_col(fill = "steelblue") +
coord_flip() +
scale_y_continuous(labels = percent_format()) +
xlab(NULL) +
ylab("Proportion Missing") +
ggtitle("Figure 3.3: Variables with missing values (Ames raw data)")# Base-R near-zero variance detection (no caret needed)
# A feature is NZV if: freqRatio > 19 OR percentUnique < 10
nzv_check <- function(x) {
if (!is.numeric(x)) return(FALSE)
x <- x[!is.na(x)]
if (length(x) == 0) return(TRUE)
tbl <- sort(table(x), decreasing = TRUE)
freq_ratio <- if (length(tbl) > 1) tbl[[1]] / tbl[[2]] else Inf
pct_unique <- length(tbl) / length(x) * 100
freq_ratio > 19 | pct_unique < 10
}
nzv_features <- ames_train %>%
select(where(is.numeric)) %>%
summarise(across(everything(), nzv_check)) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "nzv") %>%
filter(nzv == TRUE)
cat("Near-zero variance features found:", nrow(nzv_features), "\n")## Near-zero variance features found: 26
## # A tibble: 26 × 2
## Feature nzv
## <chr> <lgl>
## 1 Lot_Frontage TRUE
## 2 Year_Built TRUE
## 3 Year_Remod_Add TRUE
## 4 Mas_Vnr_Area TRUE
## 5 BsmtFin_SF_1 TRUE
## 6 BsmtFin_SF_2 TRUE
## 7 Second_Flr_SF TRUE
## 8 Low_Qual_Fin_SF TRUE
## 9 Bsmt_Full_Bath TRUE
## 10 Bsmt_Half_Bath TRUE
## # ℹ 16 more rows
ames_recipe_nzv <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_nzv(all_predictors())
ames_recipe_nzvNumeric predictors can be skewed just like the target. We measure skewness and flag features that exceed a common threshold of |skewness| > 1.
# Base-R skewness: third standardised moment
skewness_br <- function(x) {
x <- x[!is.na(x)]
n <- length(x)
m <- mean(x)
s <- sd(x)
if (s == 0 || n < 3) return(NA_real_)
(sum((x - m)^3) / n) / s^3
}
skew_vals <- ames_train %>%
select(where(is.numeric), -Sale_Price) %>%
summarise(across(everything(), skewness_br)) %>%
pivot_longer(everything(),
names_to = "Feature",
values_to = "Skewness") %>%
arrange(desc(abs(Skewness)))
cat("Features with |skewness| > 1:", sum(abs(skew_vals$Skewness) > 1, na.rm = TRUE), "\n")## Features with |skewness| > 1: 16
## # A tibble: 10 × 2
## Feature Skewness
## <chr> <dbl>
## 1 Misc_Val 20.7
## 2 Pool_Area 16.7
## 3 Lot_Area 13.0
## 4 Three_season_porch 11.3
## 5 Low_Qual_Fin_SF 10.6
## 6 Kitchen_AbvGr 4.45
## 7 Enclosed_Porch 4.40
## 8 BsmtFin_SF_2 4.16
## 9 Bsmt_Half_Bath 4.00
## 10 Screen_Porch 3.75
skew_vals %>%
ggplot(aes(Skewness)) +
geom_histogram(bins = 30, fill = "steelblue", colour = "white") +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", colour = "red") +
xlab("Skewness") + ylab("Count") +
ggtitle("Distribution of skewness across numeric predictors",
subtitle = "Red dashed lines at ±1 — common threshold for transformation")Algorithms that rely on distance or gradients (KNN, SVM, neural networks, regularised regression) require features to be on a common scale.
ames_recipe_norm <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_normalize(all_numeric_predictors()) # centers then scales
ames_recipe_normBefore vs. after standardisation —
Gr_Liv_Area:
ames_prepped <- prep(ames_recipe_norm, training = ames_train)
ames_baked <- bake(ames_prepped, new_data = ames_train)
ggplot(ames_train, aes(Gr_Liv_Area)) +
geom_histogram(bins = 50, fill = "coral", colour = "white") +
ggtitle("Before normalisation: Gr_Liv_Area")ggplot(ames_baked, aes(Gr_Liv_Area)) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
ggtitle("After normalisation: Gr_Liv_Area (mean=0, sd=1)")Summary statistics confirm the transformation:
## --- Before ---
cat("Mean:", round(mean(ames_train$Gr_Liv_Area), 2),
" SD:", round(sd(ames_train$Gr_Liv_Area), 2), "\n\n")## Mean: 1504.41 SD: 518.37
## --- After ---
cat("Mean:", round(mean(ames_baked$Gr_Liv_Area), 6),
" SD:", round(sd(ames_baked$Gr_Liv_Area), 4), "\n")## Mean: 0 SD: 1
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forecast_9.0.1 broom_1.0.12 recipes_1.3.1 rsample_1.3.2
## [5] AmesHousing_0.0.4 scales_1.4.0 purrr_1.2.1 tidyr_1.3.2
## [9] tibble_3.3.1 ggplot2_4.0.2 dplyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0
## [4] lattice_0.22-7 vctrs_0.7.1 tools_4.5.2
## [7] generics_0.1.4 parallel_4.5.2 pkgconfig_2.0.3
## [10] Matrix_1.7-4 data.table_1.18.2.1 RColorBrewer_1.1-3
## [13] S7_0.2.1 lifecycle_1.0.5 compiler_4.5.2
## [16] farver_2.1.2 codetools_0.2-20 htmltools_0.5.9
## [19] class_7.3-23 sass_0.4.10 yaml_2.3.12
## [22] prodlim_2025.04.28 pillar_1.11.1 furrr_0.3.1
## [25] jquerylib_0.1.4 MASS_7.3-65 cachem_1.1.0
## [28] gower_1.0.2 rpart_4.1.24 nlme_3.1-168
## [31] parallelly_1.46.1 lava_1.8.2 fracdiff_1.5-3
## [34] tidyselect_1.2.1 digest_0.6.39 future_1.69.0
## [37] listenv_0.10.0 labeling_0.4.3 splines_4.5.2
## [40] fastmap_1.2.0 grid_4.5.2 colorspace_2.1-2
## [43] cli_3.6.5 magrittr_2.0.4 utf8_1.2.6
## [46] survival_3.8-3 future.apply_1.20.2 withr_3.0.2
## [49] backports_1.5.0 lubridate_1.9.5 timechange_0.4.0
## [52] rmarkdown_2.30 globals_0.19.0 nnet_7.3-20
## [55] timeDate_4052.112 zoo_1.8-15 urca_1.3-4
## [58] evaluate_1.0.5 knitr_1.51 hardhat_1.4.2
## [61] rlang_1.1.7 Rcpp_1.1.1 glue_1.8.0
## [64] ipred_0.9-15 jsonlite_2.0.0 R6_2.6.1