Source: Hands-On Machine Learning with R, Chapter 3.
# Helper packages
library(dplyr) # for data manipulation
library(tibble) # for rownames_to_column
library(ggplot2) # for awesome graphics
library(visdat) # for additional visualizations
# Feature engineering packages
library(caret) # for various ML tasks
library(recipes) # for feature engineering tasks
# Load and split the Ames housing data using stratified sampling
library(AmesHousing)
library(rsample)
ames <- AmesHousing::make_ames()
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)Many machine learning models assume the target (response) variable is normally distributed. When the target is right-skewed, a log (or Box-Cox) transformation can improve model fit.
# 1. Untransformed Sale_Price
p1 <- ggplot(ames_train, aes(Sale_Price)) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
ggtitle("Sale Price (original)") +
scale_x_continuous(labels = scales::dollar)
# 2. Log-transformed
p2 <- ggplot(ames_train, aes(Sale_Price)) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
scale_x_log10(labels = scales::dollar) +
ggtitle("Sale Price (log₁₀ scale)")
gridExtra::grid.arrange(p1, p2, ncol = 2)Distribution of Sale Price before and after log transformation
ames_train %>%
is.na() %>%
reshape2::melt() %>%
ggplot(aes(Var2, Var1, fill = value)) +
geom_raster() +
scale_fill_manual(values = c("grey80", "steelblue"),
labels = c("Present", "Missing"),
name = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(x = "Feature", y = "Observations",
title = "Missing data pattern in ames_train")Missing value pattern in ames_train (first 100 rows)
# Count NAs per feature
ames_train %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
tidyr::pivot_longer(everything(),
names_to = "feature",
values_to = "n_missing") %>%
filter(n_missing > 0) %>%
arrange(desc(n_missing))## # A tibble: 0 × 2
## # ℹ 2 variables: feature <chr>, n_missing <int>
Near-zero variance features add noise and slow training. We can remove them automatically.
# Identify near-zero variance features manually
nzv_info <- nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_info %>%
rownames_to_column("feature") %>%
filter(nzv == TRUE) %>%
arrange(desc(freqRatio)) %>%
select(feature, freqRatio, percentUnique, zeroVar, nzv) %>%
head(20)## feature freqRatio percentUnique zeroVar nzv
## 1 Pool_Area 2039.00000 0.53684724 FALSE TRUE
## 2 Utilities 1023.00000 0.14641288 FALSE TRUE
## 3 Low_Qual_Fin_SF 1010.50000 1.31771596 FALSE TRUE
## 4 Three_season_porch 673.66667 1.12249878 FALSE TRUE
## 5 Pool_QC 509.75000 0.24402147 FALSE TRUE
## 6 BsmtFin_SF_2 453.25000 9.37042460 FALSE TRUE
## 7 Street 226.66667 0.09760859 FALSE TRUE
## 8 Condition_2 202.60000 0.34163006 FALSE TRUE
## 9 Misc_Val 180.54545 1.56173743 FALSE TRUE
## 10 Screen_Porch 169.90909 4.63640800 FALSE TRUE
## 11 Roof_Matl 144.35714 0.39043436 FALSE TRUE
## 12 Heating 106.00000 0.29282577 FALSE TRUE
## 13 Enclosed_Porch 102.05882 7.41825281 FALSE TRUE
## 14 Functional 38.89796 0.39043436 FALSE TRUE
## 15 Misc_Feature 34.18966 0.24402147 FALSE TRUE
## 16 BsmtFin_Type_2 25.85294 0.34163006 FALSE TRUE
## 17 Alley 24.25316 0.14641288 FALSE TRUE
## 18 Land_Slope 22.15909 0.14641288 FALSE TRUE
## 19 Kitchen_AbvGr 21.23913 0.19521718 FALSE TRUE
## 20 Bsmt_Cond 20.24444 0.29282577 FALSE TRUE
nzv_info %>%
rownames_to_column("feature") %>%
filter(nzv == TRUE) %>%
ggplot(aes(reorder(feature, freqRatio), freqRatio)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(x = NULL, y = "Frequency Ratio",
title = "Near-Zero Variance Features in ames_train")Features flagged as near-zero variance
Right-skewed numeric predictors can hurt model performance. We can detect and correct them.
library(e1071)
skew_values <- ames_train %>%
select(where(is.numeric)) %>%
summarise(across(everything(), ~skewness(., na.rm = TRUE))) %>%
tidyr::pivot_longer(everything(),
names_to = "feature",
values_to = "skewness") %>%
arrange(desc(abs(skewness)))
head(skew_values, 15)## # A tibble: 15 × 2
## feature skewness
## <chr> <dbl>
## 1 Misc_Val 20.7
## 2 Pool_Area 16.7
## 3 Lot_Area 13.0
## 4 Three_season_porch 11.3
## 5 Low_Qual_Fin_SF 10.6
## 6 Kitchen_AbvGr 4.45
## 7 Enclosed_Porch 4.40
## 8 BsmtFin_SF_2 4.16
## 9 Bsmt_Half_Bath 4.00
## 10 Screen_Porch 3.75
## 11 Open_Porch_SF 2.71
## 12 Mas_Vnr_Area 2.63
## 13 Wood_Deck_SF 1.95
## 14 First_Flr_SF 1.71
## 15 Sale_Price 1.67
p1 <- ggplot(ames_train, aes(Gr_Liv_Area)) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
ggtitle(paste0("Gr_Liv_Area (skewness = ",
round(skewness(ames_train$Gr_Liv_Area), 2), ")"))
p2 <- ggplot(ames_train, aes(log(Gr_Liv_Area))) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
ggtitle(paste0("log(Gr_Liv_Area) (skewness = ",
round(skewness(log(ames_train$Gr_Liv_Area)), 2), ")"))
gridExtra::grid.arrange(p1, p2, ncol = 2)Distribution of Gr_Liv_Area before and after log transformation
Many algorithms (regularised regression, KNN, SVMs) are sensitive to feature scale. Centering and scaling brings all features to a common scale (mean = 0, sd = 1).
ames_train %>%
select(Year_Built, Gr_Liv_Area) %>%
tidyr::pivot_longer(everything()) %>%
ggplot(aes(value)) +
geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
facet_wrap(~name, scales = "free") +
ggtitle("Before standardisation")Raw distributions of two numeric features
ames_train %>%
select(Year_Built, Gr_Liv_Area) %>%
mutate(across(everything(), scale)) %>%
tidyr::pivot_longer(everything()) %>%
ggplot(aes(value)) +
geom_histogram(bins = 40, fill = "darkorange", colour = "white") +
facet_wrap(~name, scales = "free") +
ggtitle("After standardisation (mean=0, sd=1)")Standardised distributions
# Standardise inside a recipe
recipe(Sale_Price ~ ., data = ames_train) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors())## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] e1071_1.7-17 rsample_1.3.2 AmesHousing_0.0.4 recipes_1.3.1
## [5] caret_7.0-1 lattice_0.22-7 visdat_0.6.0 ggplot2_4.0.2
## [9] tibble_3.3.1 dplyr_1.2.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4051.111 farver_2.1.2
## [4] S7_0.2.0 fastmap_1.2.0 pROC_1.19.0.1
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.5 survival_3.8-3 magrittr_2.0.3
## [13] compiler_4.5.1 rlang_1.1.7 sass_0.4.10
## [16] tools_4.5.1 utf8_1.2.6 yaml_2.3.10
## [19] data.table_1.17.8 knitr_1.50 labeling_0.4.3
## [22] plyr_1.8.9 RColorBrewer_1.1-3 withr_3.0.2
## [25] purrr_1.1.0 nnet_7.3-20 grid_4.5.1
## [28] stats4_4.5.1 colorspace_2.1-2 future_1.67.0
## [31] globals_0.18.0 scales_1.4.0 iterators_1.0.14
## [34] MASS_7.3-65 cli_3.6.5 rmarkdown_2.29
## [37] generics_0.1.4 rstudioapi_0.17.1 future.apply_1.20.0
## [40] reshape2_1.4.5 proxy_0.4-29 cachem_1.1.0
## [43] stringr_1.5.2 splines_4.5.1 forecast_9.0.1
## [46] parallel_4.5.1 urca_1.3-4 vctrs_0.7.1
## [49] hardhat_1.4.2 Matrix_1.7-3 jsonlite_2.0.0
## [52] listenv_0.9.1 foreach_1.5.2 gower_1.0.2
## [55] jquerylib_0.1.4 tidyr_1.3.1 glue_1.8.0
## [58] parallelly_1.45.1 codetools_0.2-20 lubridate_1.9.4
## [61] stringi_1.8.7 gtable_0.3.6 pillar_1.11.0
## [64] furrr_0.3.1 htmltools_0.5.8.1 ipred_0.9-15
## [67] lava_1.8.2 R6_2.6.1 evaluate_1.0.5
## [70] fracdiff_1.5-3 bslib_0.9.0 class_7.3-23
## [73] Rcpp_1.1.0 gridExtra_2.3 nlme_3.1-168
## [76] prodlim_2025.04.28 xfun_0.53 zoo_1.8-14
## [79] ModelMetrics_1.2.2.2 pkgconfig_2.0.3