Source: Hands-On Machine Learning with R, Chapter 3.


3.1 Prerequisites

# Helper packages
library(dplyr)    # for data manipulation
library(tibble)   # for rownames_to_column
library(ggplot2)  # for awesome graphics
library(visdat)   # for additional visualizations

# Feature engineering packages
library(caret)    # for various ML tasks
library(recipes)  # for feature engineering tasks

# Load and split the Ames housing data using stratified sampling
library(AmesHousing)
library(rsample)

ames <- AmesHousing::make_ames()

set.seed(123)
split     <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test  <- testing(split)

3.2 Target Engineering

Many machine learning models assume the target (response) variable is normally distributed. When the target is right-skewed, a log (or Box-Cox) transformation can improve model fit.

3.2.1 Log transformation

# 1. Untransformed Sale_Price
p1 <- ggplot(ames_train, aes(Sale_Price)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  ggtitle("Sale Price (original)") +
  scale_x_continuous(labels = scales::dollar)

# 2. Log-transformed
p2 <- ggplot(ames_train, aes(Sale_Price)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  scale_x_log10(labels = scales::dollar) +
  ggtitle("Sale Price (log₁₀ scale)")

gridExtra::grid.arrange(p1, p2, ncol = 2)
Distribution of Sale Price before and after log transformation

Distribution of Sale Price before and after log transformation

# Incorporating log transformation into a recipe
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(Sale_Price, base = 10)

3.2.2 Box-Cox transformation

# Box-Cox can find the optimal lambda automatically
lambda <- forecast::BoxCox.lambda(ames_train$Sale_Price)
cat("Optimal Box-Cox lambda:", round(lambda, 3), "\n")
## Optimal Box-Cox lambda: 0.139
# Apply in a recipe
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_BoxCox(Sale_Price)

3.3 Dealing with Missingness

3.3.1 Visualising missing values

ames_train %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  scale_fill_manual(values = c("grey80", "steelblue"),
                    labels = c("Present", "Missing"),
                    name   = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(x = "Feature", y = "Observations",
       title = "Missing data pattern in ames_train")
Missing value pattern in ames_train (first 100 rows)

Missing value pattern in ames_train (first 100 rows)

# Count NAs per feature
ames_train %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  tidyr::pivot_longer(everything(),
                      names_to  = "feature",
                      values_to = "n_missing") %>%
  filter(n_missing > 0) %>%
  arrange(desc(n_missing))
## # A tibble: 0 × 2
## # ℹ 2 variables: feature <chr>, n_missing <int>

3.3.2 Imputation

3.3.2.1 Estimated statistic (mean / median)

ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_mean(all_numeric_predictors())

ames_recipe

3.3.2.2 K-Nearest Neighbor imputation

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_knn(all_predictors(), neighbors = 6)

3.3.2.3 Tree-based (bagged) imputation

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_bag(all_predictors())

3.4 Feature Filtering

Near-zero variance features add noise and slow training. We can remove them automatically.

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors())
# Identify near-zero variance features manually
nzv_info <- nearZeroVar(ames_train, saveMetrics = TRUE)

nzv_info %>%
  rownames_to_column("feature") %>%
  filter(nzv == TRUE) %>%
  arrange(desc(freqRatio)) %>%
  select(feature, freqRatio, percentUnique, zeroVar, nzv) %>%
  head(20)
##               feature  freqRatio percentUnique zeroVar  nzv
## 1           Pool_Area 2039.00000    0.53684724   FALSE TRUE
## 2           Utilities 1023.00000    0.14641288   FALSE TRUE
## 3     Low_Qual_Fin_SF 1010.50000    1.31771596   FALSE TRUE
## 4  Three_season_porch  673.66667    1.12249878   FALSE TRUE
## 5             Pool_QC  509.75000    0.24402147   FALSE TRUE
## 6        BsmtFin_SF_2  453.25000    9.37042460   FALSE TRUE
## 7              Street  226.66667    0.09760859   FALSE TRUE
## 8         Condition_2  202.60000    0.34163006   FALSE TRUE
## 9            Misc_Val  180.54545    1.56173743   FALSE TRUE
## 10       Screen_Porch  169.90909    4.63640800   FALSE TRUE
## 11          Roof_Matl  144.35714    0.39043436   FALSE TRUE
## 12            Heating  106.00000    0.29282577   FALSE TRUE
## 13     Enclosed_Porch  102.05882    7.41825281   FALSE TRUE
## 14         Functional   38.89796    0.39043436   FALSE TRUE
## 15       Misc_Feature   34.18966    0.24402147   FALSE TRUE
## 16     BsmtFin_Type_2   25.85294    0.34163006   FALSE TRUE
## 17              Alley   24.25316    0.14641288   FALSE TRUE
## 18         Land_Slope   22.15909    0.14641288   FALSE TRUE
## 19      Kitchen_AbvGr   21.23913    0.19521718   FALSE TRUE
## 20          Bsmt_Cond   20.24444    0.29282577   FALSE TRUE
nzv_info %>%
  rownames_to_column("feature") %>%
  filter(nzv == TRUE) %>%
  ggplot(aes(reorder(feature, freqRatio), freqRatio)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = NULL, y = "Frequency Ratio",
       title = "Near-Zero Variance Features in ames_train")
Features flagged as near-zero variance

Features flagged as near-zero variance


3.5 Numeric Feature Engineering

3.5.1 Skewness

Right-skewed numeric predictors can hurt model performance. We can detect and correct them.

library(e1071)

skew_values <- ames_train %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(), ~skewness(., na.rm = TRUE))) %>%
  tidyr::pivot_longer(everything(),
                      names_to  = "feature",
                      values_to = "skewness") %>%
  arrange(desc(abs(skewness)))

head(skew_values, 15)
## # A tibble: 15 × 2
##    feature            skewness
##    <chr>                 <dbl>
##  1 Misc_Val              20.7 
##  2 Pool_Area             16.7 
##  3 Lot_Area              13.0 
##  4 Three_season_porch    11.3 
##  5 Low_Qual_Fin_SF       10.6 
##  6 Kitchen_AbvGr          4.45
##  7 Enclosed_Porch         4.40
##  8 BsmtFin_SF_2           4.16
##  9 Bsmt_Half_Bath         4.00
## 10 Screen_Porch           3.75
## 11 Open_Porch_SF          2.71
## 12 Mas_Vnr_Area           2.63
## 13 Wood_Deck_SF           1.95
## 14 First_Flr_SF           1.71
## 15 Sale_Price             1.67
p1 <- ggplot(ames_train, aes(Gr_Liv_Area)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  ggtitle(paste0("Gr_Liv_Area  (skewness = ",
                 round(skewness(ames_train$Gr_Liv_Area), 2), ")"))

p2 <- ggplot(ames_train, aes(log(Gr_Liv_Area))) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  ggtitle(paste0("log(Gr_Liv_Area)  (skewness = ",
                 round(skewness(log(ames_train$Gr_Liv_Area)), 2), ")"))

gridExtra::grid.arrange(p1, p2, ncol = 2)
Distribution of Gr_Liv_Area before and after log transformation

Distribution of Gr_Liv_Area before and after log transformation

# Apply Yeo-Johnson (handles zeroes & negatives) in a recipe
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric_predictors())

3.5.2 Standardisation

Many algorithms (regularised regression, KNN, SVMs) are sensitive to feature scale. Centering and scaling brings all features to a common scale (mean = 0, sd = 1).

ames_train %>%
  select(Year_Built, Gr_Liv_Area) %>%
  tidyr::pivot_longer(everything()) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
  facet_wrap(~name, scales = "free") +
  ggtitle("Before standardisation")
Raw distributions of two numeric features

Raw distributions of two numeric features

ames_train %>%
  select(Year_Built, Gr_Liv_Area) %>%
  mutate(across(everything(), scale)) %>%
  tidyr::pivot_longer(everything()) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 40, fill = "darkorange", colour = "white") +
  facet_wrap(~name, scales = "free") +
  ggtitle("After standardisation (mean=0, sd=1)")
Standardised distributions

Standardised distributions

# Standardise inside a recipe
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors())

Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] e1071_1.7-17      rsample_1.3.2     AmesHousing_0.0.4 recipes_1.3.1    
##  [5] caret_7.0-1       lattice_0.22-7    visdat_0.6.0      ggplot2_4.0.2    
##  [9] tibble_3.3.1      dplyr_1.2.0      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4051.111    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.5      survival_3.8-3       magrittr_2.0.3      
## [13] compiler_4.5.1       rlang_1.1.7          sass_0.4.10         
## [16] tools_4.5.1          utf8_1.2.6           yaml_2.3.10         
## [19] data.table_1.17.8    knitr_1.50           labeling_0.4.3      
## [22] plyr_1.8.9           RColorBrewer_1.1-3   withr_3.0.2         
## [25] purrr_1.1.0          nnet_7.3-20          grid_4.5.1          
## [28] stats4_4.5.1         colorspace_2.1-2     future_1.67.0       
## [31] globals_0.18.0       scales_1.4.0         iterators_1.0.14    
## [34] MASS_7.3-65          cli_3.6.5            rmarkdown_2.29      
## [37] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [40] reshape2_1.4.5       proxy_0.4-29         cachem_1.1.0        
## [43] stringr_1.5.2        splines_4.5.1        forecast_9.0.1      
## [46] parallel_4.5.1       urca_1.3-4           vctrs_0.7.1         
## [49] hardhat_1.4.2        Matrix_1.7-3         jsonlite_2.0.0      
## [52] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [55] jquerylib_0.1.4      tidyr_1.3.1          glue_1.8.0          
## [58] parallelly_1.45.1    codetools_0.2-20     lubridate_1.9.4     
## [61] stringi_1.8.7        gtable_0.3.6         pillar_1.11.0       
## [64] furrr_0.3.1          htmltools_0.5.8.1    ipred_0.9-15        
## [67] lava_1.8.2           R6_2.6.1             evaluate_1.0.5      
## [70] fracdiff_1.5-3       bslib_0.9.0          class_7.3-23        
## [73] Rcpp_1.1.0           gridExtra_2.3        nlme_3.1-168        
## [76] prodlim_2025.04.28   xfun_0.53            zoo_1.8-14          
## [79] ModelMetrics_1.2.2.2 pkgconfig_2.0.3