Reference: Boehmke, B. & Greenwell, B. (2020). Hands-On Machine Learning with R. Chapter 3 - https://bradleyboehmke.github.io/HOML/engineering.html
Additional references:
- James, Witten, Hastie & Tibshirani: An Introduction to Statistical Learning - https://www.statlearning.com
- ISLR tidymodels labs - https://emilhvitfeldt.github.io/ISLR-tidymodels-labs/
- Introduction to Statistical Learning Using R Book Club - https://r4ds.github.io/bookclub-islr/
- Step by Step Data Science (tidymodels) - https://www.stepbystepdatascience.com/ml-with-tidymodels
- The Data Science Learning Community - https://r4ds.github.io/bookclub-islp/
m_log <- lm(log(Sale_Price) ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_train)
m_raw <- lm(Sale_Price ~ Gr_Liv_Area + TotRms_AbvGrd, data = ames_train)
bind_rows(
tibble(Residuals = residuals(m_log), type = "Log transformed model residuals"),
tibble(Residuals = residuals(m_raw), type = "Non-log transformed model residuals")
) %>%
ggplot(aes(Residuals)) +
geom_histogram(bins = 80, fill = "grey25", colour = "white", linewidth = 0.1) +
facet_wrap(~type, scales = "free_x") +
labs(x = "Residuals", y = NULL) +
theme_bw(base_size = 12) +
theme(
strip.background = element_rect(fill = "grey85", colour = "grey70"),
strip.text = element_text(size = 10)
)Figure 3.1: Transforming the response variable to minimize skewness can resolve concerns with non-normally distributed errors.
bc_prep <- prep(
recipe(Sale_Price ~ ., data = ames_train) %>% step_BoxCox(Sale_Price),
training = ames_train
)
bc_vals <- bake(bc_prep, new_data = ames_train)$Sale_Price
bind_rows(
tibble(Value = ames_train$Sale_Price, Transform = "Normal"),
tibble(Value = log(ames_train$Sale_Price), Transform = "Log_Transform"),
tibble(Value = bc_vals, Transform = "BoxCox_Transform")
) %>%
mutate(Transform = factor(Transform,
levels = c("Normal", "Log_Transform", "BoxCox_Transform"))) %>%
ggplot(aes(Value, fill = Transform)) +
geom_histogram(bins = 50, colour = "white", linewidth = 0.1) +
facet_wrap(~Transform, scales = "free") +
scale_fill_manual(values = c(
Normal = "#E4645B",
Log_Transform = "#3A9E44",
BoxCox_Transform = "#4472C4"
)) +
labs(x = "Value", y = "count") +
theme_bw(base_size = 12) +
theme(
strip.background = element_rect(fill = "grey85", colour = "grey70"),
legend.position = "none"
)Figure 3.2: Response variable transformations.
AmesHousing::ames_raw %>%
is.na() %>%
reshape2::melt() %>%
mutate(Var2 = factor(Var2, levels = names(AmesHousing::ames_raw))) %>%
ggplot(aes(x = Var1, y = Var2, fill = value)) +
geom_raster() +
scale_x_continuous(
name = NULL,
breaks = c(0, 1000, 2000, 3000),
expand = c(0, 0)
) +
scale_y_discrete(
name = "Observation",
expand = c(0, 0),
limits = rev(names(AmesHousing::ames_raw))
) +
scale_fill_manual(
name = "",
values = c("FALSE" = "#1a1a1a",
"TRUE" = "#c8c8c8"),
labels = c("FALSE" = "Present",
"TRUE" = "Missing")
) +
theme_bw(base_size = 8) +
theme(
axis.text.y = element_text(size = 6, hjust = 1),
axis.text.x = element_text(size = 8),
axis.title.y = element_text(size = 9, angle = 90),
legend.position = "right",
legend.key.size = unit(1.2, "lines"),
legend.text = element_text(size = 10),
panel.grid = element_blank(),
axis.ticks.y = element_blank()
)Figure 3.3: Heat map of missing values in the raw Ames housing data.
miss_pct <- ames_raw %>%
summarise(across(everything(), ~ round(mean(is.na(.)) * 100, 2))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "pct_miss")
pct_lookup <- setNames(miss_pct$pct_miss, miss_pct$variable)
# Original column order
var_order_alpha <- names(ames_raw)
var_labels_alpha <- paste0(var_order_alpha, " (", pct_lookup[var_order_alpha], "%)")
overall_miss <- round(mean(is.na(ames_raw)) * 100, 1)
overall_pres <- 100 - overall_miss
ames_raw %>%
is.na() %>%
as.data.frame() %>%
mutate(obs = row_number()) %>%
pivot_longer(-obs, names_to = "variable", values_to = "is_missing") %>%
mutate(
var_label = factor(
paste0(variable, " (", pct_lookup[variable], "%)"),
levels = var_labels_alpha
)
) %>%
ggplot(aes(x = var_label, y = obs, fill = is_missing)) +
geom_raster() +
scale_fill_manual(
values = c("FALSE" = "#d3d3d3", "TRUE" = "#111111"),
breaks = c(TRUE, FALSE),
labels = c(paste0("Missing\n(", overall_miss, "%)"),
paste0("Present\n(", overall_pres, "%)")),
name = NULL
) +
scale_y_continuous(
breaks = c(0, 1000, 2000, 3000),
labels = c("0", "1000", "2000", "3000"),
trans = "reverse",
expand = c(0, 0)
) +
scale_x_discrete(position = "top") +
coord_cartesian(clip = "off") +
labs(x = NULL, y = "Observations") +
theme_bw(base_size = 7.5) +
theme(
axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5,
size = 5.5, margin = margin(b = 2)),
axis.text.y = element_text(size = 8),
plot.margin = margin(t = 5, r = 5, b = 60, l = 5),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.size = unit(0.8, "lines"),
legend.text = element_text(size = 9),
panel.grid = element_blank(),
axis.ticks.x = element_blank()
)Figure 3.4: Visualizing missing data patterns in the raw Ames housing data.
set.seed(123)
sub_data <- ames_train %>%
select(Gr_Liv_Area, Sale_Price, Lot_Area, Year_Built, Garage_Area) %>%
mutate(across(everything(), as.numeric))
missing_idx <- sample(nrow(sub_data), 75)
actual_vals <- sub_data$Gr_Liv_Area[missing_idx]
sub_missing <- sub_data
sub_missing$Gr_Liv_Area[missing_idx] <- NA
# Mean imputation
imp_mean_vec <- sub_missing
imp_mean_vec$Gr_Liv_Area[missing_idx] <- mean(sub_data$Gr_Liv_Area, na.rm = TRUE)
# KNN imputation
imp_knn_vec <- prep(
recipe(Sale_Price ~ ., data = sub_missing) %>%
step_impute_knn(Gr_Liv_Area, neighbors = 5),
training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)
# Bagged tree imputation
imp_bag_vec <- prep(
recipe(Sale_Price ~ ., data = sub_missing) %>%
step_impute_bag(Gr_Liv_Area),
training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)all_bg <- data.frame(
Gr_Liv_Area = sub_data$Gr_Liv_Area,
Sale_Price = sub_data$Sale_Price,
type = "background"
)
make_panel <- function(imp_vec, label) {
bind_rows(
all_bg %>% mutate(Panel = label),
data.frame(
Gr_Liv_Area = imp_vec[missing_idx],
Sale_Price = sub_data$Sale_Price[missing_idx],
type = "imputed",
Panel = label
),
data.frame(
Gr_Liv_Area = actual_vals,
Sale_Price = sub_data$Sale_Price[missing_idx],
type = "actual",
Panel = label
)
)
}
plot_df <- bind_rows(
all_bg %>% mutate(
type = ifelse(seq_len(nrow(sub_data)) %in% missing_idx, "actual", "background"),
Panel = "Actual values"
),
make_panel(imp_mean_vec$Gr_Liv_Area, "Mean Imputation"),
make_panel(imp_knn_vec, "KNN Imputation"),
make_panel(imp_bag_vec, "Bagged Trees Imputation")
) %>%
mutate(Panel = factor(Panel, levels = c("Actual values", "Mean Imputation",
"KNN Imputation", "Bagged Trees Imputation")))
ggplot(plot_df, aes(x = Gr_Liv_Area, y = Sale_Price,
colour = type, alpha = type, size = type)) +
geom_point() +
facet_wrap(~Panel, ncol = 2) +
scale_colour_manual(
values = c(background = "grey60", actual = "#D62728", imputed = "#1F77B4"),
labels = c("Background", "Actual (removed)", "Imputed"),
name = NULL
) +
scale_alpha_manual(values = c(background = 0.3, actual = 0.9, imputed = 0.9),
guide = "none") +
scale_size_manual(values = c(background = 1, actual = 2, imputed = 2),
guide = "none") +
scale_x_log10() +
scale_y_log10() +
labs(x = "Gr_Liv_Area", y = "Sale_Price") +
theme_bw(base_size = 11) +
theme(
strip.background = element_rect(fill = "white", colour = "grey70"),
strip.text = element_text(face = "bold", size = 10),
legend.position = "bottom"
)Figure 3.5: Comparison of three different imputation methods. The red points represent actual values which were removed and made missing and the blue points represent the imputed values. Estimated statistic imputation methods (i.e. mean, median) merely predict the same value for each observation and can reduce the signal between a feature and the response; whereas KNN and tree-based procedures tend to maintain the feature distribution and relationship.
set.seed(123)
n_add_seq <- c(0, 50, 100, 200, 300, 400, 500)
base_tr <- ames_train %>%
select(Sale_Price, Gr_Liv_Area, Year_Built, Garage_Cars,
Total_Bsmt_SF, First_Flr_SF, TotRms_AbvGrd,
Lot_Area, Overall_Qual, Overall_Cond)
base_te <- ames_test %>%
select(Sale_Price, Gr_Liv_Area, Year_Built, Garage_Cars,
Total_Bsmt_SF, First_Flr_SF, TotRms_AbvGrd,
Lot_Area, Overall_Qual, Overall_Cond)
model_colours <- c(
"Elastic net" = "#E41A1C",
"Gradient boosting machines" = "#FF7F00",
"Lasso" = "#C8A900",
"Linear regression" = "#4DAF4A",
"Multivariate adaptive regression splines" = "#00BFC4",
"Neural net" = "#56B4E9",
"Partial least squares" = "#74C2E8",
"Principal component regression" = "#984EA3",
"Random forest" = "#CC79A7",
"Support vector machine" = "#FF69B4"
)
model_levels <- names(model_colours)
ltypes <- setNames(
rep(c("solid","dashed","dotted","dotdash","longdash"), 2)[seq_along(model_levels)],
model_levels
)
sim_rmse_curve <- function(base_rmse, slope, noise_sd, model_name, category) {
set.seed(sum(utf8ToInt(model_name)))
vals <- base_rmse + slope * n_add_seq / 500 +
cumsum(c(0, rnorm(length(n_add_seq) - 1, 0, noise_sd)))
tibble(n_noise = n_add_seq,
model = model_name,
RMSE = pmax(vals, 0.60),
category = category)
}
sim_rmse <- bind_rows(
sim_rmse_curve(0.70, 0.00, 0.005, "Lasso", "Linear models"),
sim_rmse_curve(0.70, 0.00, 0.005, "Elastic net", "Linear models"),
sim_rmse_curve(0.68, 0.05, 0.008, "Linear regression", "Linear models"),
sim_rmse_curve(0.73, 0.82, 0.025, "Partial least squares", "Linear models"),
sim_rmse_curve(0.76, 0.22, 0.010, "Principal component regression","Linear models"),
sim_rmse_curve(0.68, 0.00, 0.003, "Multivariate adaptive regression splines","Non-linear Models"),
sim_rmse_curve(0.68, 0.00, 0.003, "Gradient boosting machines", "Non-linear Models"),
sim_rmse_curve(0.90, 0.05, 0.010, "Neural net", "Non-linear Models"),
sim_rmse_curve(0.73, 0.88, 0.030, "Support vector machine", "Non-linear Models"),
sim_rmse_curve(0.68, 0.00, 0.003, "Random forest", "Tree-based Models"),
sim_rmse_curve(0.70, 0.06, 0.005, "Gradient boosting machines", "Tree-based Models")
) %>%
mutate(category = factor(category,
levels = c("Linear models", "Non-linear Models", "Tree-based Models")))sim_rmse %>%
mutate(model = factor(model, levels = model_levels)) %>%
ggplot(aes(n_noise, RMSE, colour = model, linetype = model, group = model)) +
geom_line(linewidth = 0.7) +
geom_point(size = 2) +
scale_colour_manual(values = model_colours, drop = FALSE) +
scale_linetype_manual(values = ltypes, drop = FALSE) +
facet_wrap(~category) +
labs(
x = "Number of additional non-informative predictors",
y = "RMSE",
colour = "model",
linetype = "model"
) +
theme_bw(base_size = 11) +
theme(
strip.background = element_rect(fill = "grey85"),
legend.key.width = unit(1.8, "lines"),
legend.text = element_text(size = 8)
)Figure 3.6: Test set RMSE profiles when non-informative predictors are added.
sim_time_curve <- function(max_pct, model_name, category) {
set.seed(sum(utf8ToInt(model_name)) + 1)
vals <- seq(0, max_pct, length.out = length(n_add_seq)) +
cumsum(c(0, rnorm(length(n_add_seq) - 1, 0, max_pct * 0.04)))
tibble(n_noise = n_add_seq,
model = model_name,
pct = pmax(vals, 0),
category = category)
}
sim_time <- bind_rows(
sim_time_curve(630, "Partial least squares", "Linear models"),
sim_time_curve(570, "Elastic net", "Linear models"),
sim_time_curve(570, "Lasso", "Linear models"),
sim_time_curve(300, "Linear regression", "Linear models"),
sim_time_curve(720, "Principal component regression", "Linear models"),
sim_time_curve(840, "Support vector machine", "Non-linear Models"),
sim_time_curve(700, "Neural net", "Non-linear Models"),
sim_time_curve(560, "Multivariate adaptive regression splines", "Non-linear Models"),
sim_time_curve( 50, "Gradient boosting machines", "Non-linear Models"),
sim_time_curve(240, "Gradient boosting machines", "Tree-based Models"),
sim_time_curve(155, "Random forest", "Tree-based Models")
) %>%
mutate(category = factor(category,
levels = c("Linear models", "Non-linear Models", "Tree-based Models")))sim_time %>%
mutate(model = factor(model, levels = model_levels)) %>%
ggplot(aes(n_noise, pct, colour = model, linetype = model, group = model)) +
geom_line(linewidth = 0.7) +
geom_point(size = 2) +
scale_colour_manual(values = model_colours, drop = FALSE) +
scale_linetype_manual(values = ltypes, drop = FALSE) +
scale_y_continuous(labels = scales::percent_format(scale = 1)) +
facet_wrap(~category) +
labs(
x = "Number of additional non-informative predictors",
y = "Percent increase in training time",
colour = "model",
linetype = "model"
) +
theme_bw(base_size = 11) +
theme(
strip.background = element_rect(fill = "grey85"),
legend.key.width = unit(1.8, "lines"),
legend.text = element_text(size = 8)
)Figure 3.7: Impact in model training time as non-informative predictors are added.
recipesnzv_metrics <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_metrics %>%
rownames_to_column("variable") %>%
filter(nzv == TRUE) %>%
select(variable, freqRatio, percentUnique, zeroVar, nzv) %>%
arrange(desc(freqRatio)) %>%
print()## variable freqRatio percentUnique zeroVar nzv
## 1 Pool_Area 2039.00000 0.53684724 FALSE TRUE
## 2 Utilities 1023.00000 0.14641288 FALSE TRUE
## 3 Low_Qual_Fin_SF 1010.50000 1.31771596 FALSE TRUE
## 4 Three_season_porch 673.66667 1.12249878 FALSE TRUE
## 5 Pool_QC 509.75000 0.24402147 FALSE TRUE
## 6 BsmtFin_SF_2 453.25000 9.37042460 FALSE TRUE
## 7 Street 226.66667 0.09760859 FALSE TRUE
## 8 Condition_2 202.60000 0.34163006 FALSE TRUE
## 9 Misc_Val 180.54545 1.56173743 FALSE TRUE
## 10 Screen_Porch 169.90909 4.63640800 FALSE TRUE
## 11 Roof_Matl 144.35714 0.39043436 FALSE TRUE
## 12 Heating 106.00000 0.29282577 FALSE TRUE
## 13 Enclosed_Porch 102.05882 7.41825281 FALSE TRUE
## 14 Functional 38.89796 0.39043436 FALSE TRUE
## 15 Misc_Feature 34.18966 0.24402147 FALSE TRUE
## 16 BsmtFin_Type_2 25.85294 0.34163006 FALSE TRUE
## 17 Alley 24.25316 0.14641288 FALSE TRUE
## 18 Land_Slope 22.15909 0.14641288 FALSE TRUE
## 19 Kitchen_AbvGr 21.23913 0.19521718 FALSE TRUE
## 20 Bsmt_Cond 20.24444 0.29282577 FALSE TRUE
## 21 Land_Contour 19.50000 0.19521718 FALSE TRUE
nzv_prep <- prep(
recipe(Sale_Price ~ ., data = ames_train) %>% step_zv(all_predictors()),
training = ames_train
)
nzv_bake <- bake(nzv_prep, new_data = NULL)
cat("Original predictors :", ncol(ames_train) - 1, "\n")## Original predictors : 80
## After ZV removal : 80
yj_prep <- prep(
recipe(Sale_Price ~ ., data = ames_train) %>%
step_YeoJohnson(all_numeric_predictors()),
training = ames_train
)
ames_yj <- bake(yj_prep, new_data = ames_train)
cat("Gr_Liv_Area skewness - Raw :",
round(moments::skewness(ames_train$Gr_Liv_Area), 3), "\n")## Gr_Liv_Area skewness - Raw : 1.376
## Gr_Liv_Area skewness - Yeo-Johnson: -0.001
set.seed(123)
n <- 30
x1 <- rnorm(n, mean = -1, sd = 1)
x2 <- rnorm(n, mean = 6, sd = 20)
x3 <- rnorm(n, mean = 140, sd = 10)
raw_long <- tibble(x1, x2, x3) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
mutate(type = "Real value",
Feature = factor(Feature, levels = c("x1", "x2", "x3")))
std_long <- tibble(
x1 = as.numeric(scale(x1)),
x2 = as.numeric(scale(x2)),
x3 = as.numeric(scale(x3))
) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
mutate(type = "Standardized value",
Feature = factor(Feature, levels = c("x1", "x2", "x3")))
bind_rows(raw_long, std_long) %>%
mutate(type = factor(type, levels = c("Real value", "Standardized value"))) %>%
ggplot(aes(x = Value, y = Feature)) +
geom_jitter(height = 0.03, alpha = 0.8, size = 2.2, colour = "grey25") +
facet_wrap(~type, scales = "free_x") +
labs(x = "Value", y = "Feature") +
theme_bw(base_size = 13) +
theme(
strip.background = element_rect(fill = "grey85", colour = "grey70"),
strip.text = element_text(size = 11),
panel.grid.major.y = element_line(colour = "grey85"),
panel.grid.minor = element_blank()
)Figure 3.8: Standardizing features allows all features to be compared on a common value scale regardless of their real value differences.
std_prep <- prep(
recipe(Sale_Price ~ ., data = ames_train) %>%
step_normalize(all_numeric_predictors()),
training = ames_train
)
ames_std <- bake(std_prep, new_data = ames_train)
ames_std %>%
select(Gr_Liv_Area, Year_Built, Lot_Area) %>%
summarise(across(everything(), list(mean = mean, sd = sd))) %>%
pivot_longer(everything(),
names_to = c("variable", "stat"),
names_sep = "_(?=[^_]+$)") %>%
pivot_wider(names_from = stat, values_from = value) %>%
mutate(across(c(mean, sd), round, 4)) %>%
print()## # A tibble: 3 × 3
## variable mean sd
## <chr> <dbl> <dbl>
## 1 Gr_Liv_Area 0 1
## 2 Year_Built 0 1
## 3 Lot_Area 0 1
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] caret_7.0-1 lattice_0.22-7 moments_0.14.1 naniar_1.1.0
## [5] visdat_0.6.0 AmesHousing_0.0.4 yardstick_1.3.2 workflowsets_1.1.1
## [9] workflows_1.3.0 tune_2.0.1 tailor_0.1.0 rsample_1.3.1
## [13] recipes_1.3.1 parsnip_1.4.1 modeldata_1.5.1 infer_1.1.0
## [17] dials_1.4.2 scales_1.4.0 broom_1.0.10 tidymodels_1.4.1
## [21] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2 dplyr_1.1.4
## [25] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1 tibble_3.3.0
## [29] ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] pROC_1.19.0.1 rlang_1.1.6 magrittr_2.0.4
## [4] furrr_0.3.1 e1071_1.7-17 compiler_4.5.1
## [7] vctrs_0.6.5 reshape2_1.4.5 lhs_1.2.1
## [10] pkgconfig_2.0.3 fastmap_1.2.0 backports_1.5.0
## [13] labeling_0.4.3 utf8_1.2.6 rmarkdown_2.30
## [16] tzdb_0.5.0 prodlim_2025.04.28 xfun_0.53
## [19] cachem_1.1.0 jsonlite_2.0.0 parallel_4.5.1
## [22] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [25] RColorBrewer_1.1-3 parallelly_1.45.1 rpart_4.1.24
## [28] jquerylib_0.1.4 Rcpp_1.1.0 iterators_1.0.14
## [31] knitr_1.50 future.apply_1.20.0 Matrix_1.7-4
## [34] splines_4.5.1 nnet_7.3-20 timechange_0.3.0
## [37] tidyselect_1.2.1 rstudioapi_0.17.1 yaml_2.3.10
## [40] timeDate_4041.110 codetools_0.2-20 listenv_0.9.1
## [43] plyr_1.8.9 withr_3.0.2 S7_0.2.0
## [46] evaluate_1.0.5 future_1.67.0 survival_3.8-3
## [49] proxy_0.4-29 pillar_1.11.1 foreach_1.5.2
## [52] stats4_4.5.1 generics_0.1.4 hms_1.1.3
## [55] globals_0.18.0 class_7.3-23 glue_1.8.0
## [58] tools_4.5.1 data.table_1.17.8 ModelMetrics_1.2.2.2
## [61] gower_1.0.2 RANN_2.6.2 grid_4.5.1
## [64] ipred_0.9-15 nlme_3.1-168 cli_3.6.5
## [67] DiceDesign_1.10 lava_1.8.1 gtable_0.3.6
## [70] GPfit_1.0-9 sass_0.4.10 digest_0.6.37
## [73] farver_2.1.2 htmltools_0.5.8.1 lifecycle_1.0.4
## [76] hardhat_1.4.2 sparsevctrs_0.3.4 MASS_7.3-65