Reproducing outputs from Sections 3.1–3.5 of Hands-On Machine Learning with R by Bradley Boehmke & Brandon Greenwell.
# Packages used in this chapter
cat("recipes version :", as.character(packageVersion("recipes")), "\n")## recipes version : 1.3.1
## rsample version : 1.3.1
## AmesHousing : 0.0.4
# Confirm the train/test split
cat("\nTraining rows:", nrow(ames_train), "| columns:", ncol(ames_train))##
## Training rows: 2049 | columns: 81
##
## Test rows : 881 | columns: 81
## [1] 105500 88000 120000 125000 67500 112000
A right-skewed response can hurt model performance. Two popular fixes are a log transformation and a Box-Cox / Yeo-Johnson transformation.
ggplot(ames_train, aes(x = Sale_Price)) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
scale_x_continuous(labels = dollar_format()) +
labs(title = "Sale Price – original scale",
x = "Sale Price ($)", y = "Count")ggplot(ames_train, aes(x = log(Sale_Price))) +
geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
labs(title = "Sale Price – log-transformed",
x = "log(Sale Price)", y = "Count")## Total NAs in raw Ames data: 13997
miss_mat <- is.na(ames_raw)
miss_df <- data.frame(
row = rep(seq_len(nrow(miss_mat)), ncol(miss_mat)),
col = rep(colnames(miss_mat), each = nrow(miss_mat)),
missing = as.vector(miss_mat)
)
ggplot(miss_df, aes(x = col, y = row, fill = missing)) +
geom_raster() +
coord_flip() +
scale_fill_manual(
name = "",
values = c("FALSE" = "grey80", "TRUE" = "#E74C3C"),
labels = c("Present", "Missing")
) +
scale_y_continuous(NULL, expand = c(0, 0)) +
labs(title = "Missing values – raw Ames housing data", x = NULL) +
theme_minimal(base_size = 8) +
theme(axis.text.y = element_text(size = 6))Properties with no garage have NA for all garage-related
fields — the missingness is informative, not random.
ames_raw %>%
filter(is.na(`Garage Type`)) %>%
select(`Garage Type`, `Garage Cars`, `Garage Area`) %>%
head(10)## # A tibble: 10 × 3
## `Garage Type` `Garage Cars` `Garage Area`
## <chr> <int> <int>
## 1 <NA> 0 0
## 2 <NA> 0 0
## 3 <NA> 0 0
## 4 <NA> 0 0
## 5 <NA> 0 0
## 6 <NA> 0 0
## 7 <NA> 0 0
## 8 <NA> 0 0
## 9 <NA> 0 0
## 10 <NA> 0 0
# Handles both old (<0.1.16) and new (>=0.1.16) recipes naming
if ("step_impute_mean" %in% getNamespaceExports("recipes")) {
rec_mean <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_mean(all_numeric_predictors())
} else {
rec_mean <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_meanimpute(all_numeric())
}
rec_meanFeatures with near-zero variance carry almost no
information and can destabilise some models. step_nzv()
removes them automatically.
nzv_prep <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_nzv(all_predictors()) %>%
prep(training = ames_train)
flagged <- tidy(nzv_prep, number = 1)
cat("Variables flagged as near-zero variance:", nrow(flagged), "\n\n")## Variables flagged as near-zero variance: 21
## # A tibble: 21 × 2
## terms id
## <chr> <chr>
## 1 Street nzv_w5w6w
## 2 Alley nzv_w5w6w
## 3 Land_Contour nzv_w5w6w
## 4 Utilities nzv_w5w6w
## 5 Land_Slope nzv_w5w6w
## 6 Condition_2 nzv_w5w6w
## 7 Roof_Matl nzv_w5w6w
## 8 Bsmt_Cond nzv_w5w6w
## 9 BsmtFin_Type_2 nzv_w5w6w
## 10 BsmtFin_SF_2 nzv_w5w6w
## # ℹ 11 more rows
Right-skewed predictors can degrade algorithms that assume approximate normality. We compare distributions before and after a Yeo-Johnson transform.
ames_train %>%
select(Lot_Area, Gr_Liv_Area, TotRms_AbvGrd) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
facet_wrap(~variable, scales = "free") +
labs(title = "Numeric predictors – BEFORE transformation",
x = NULL, y = "Count")rec_yj <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_YeoJohnson(all_numeric_predictors())
baked_yj <- rec_yj %>%
prep(training = ames_train) %>%
bake(new_data = ames_train)
baked_yj %>%
select(Lot_Area, Gr_Liv_Area, TotRms_AbvGrd) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 40, fill = "darkorange", colour = "white") +
facet_wrap(~variable, scales = "free") +
labs(title = "Numeric predictors – AFTER Yeo-Johnson transformation",
x = NULL, y = "Count")Distance-based and regularised models need features on a
common scale. step_normalize() subtracts
the mean and divides by the SD (mean = 0, SD = 1).
rec_std <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_normalize(all_numeric_predictors())
rec_stdbaked_std <- rec_std %>%
prep(training = ames_train) %>%
bake(new_data = ames_train)
p1 <- ggplot(ames_train, aes(x = Gr_Liv_Area)) +
geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
labs(title = "Original", x = "Gr_Liv_Area", y = "Count")
p2 <- ggplot(baked_std, aes(x = Gr_Liv_Area)) +
geom_histogram(bins = 40, fill = "darkorange", colour = "white") +
labs(title = "Standardised (mean=0, SD=1)", x = "Gr_Liv_Area", y = "Count")
grid.arrange(p1, p2, ncol = 2,
top = "Gr_Liv_Area: before vs. after standardisation")baked_std %>%
summarise(
mean_GrLivArea = round(mean(Gr_Liv_Area), 6),
sd_GrLivArea = round(sd(Gr_Liv_Area), 6)
)## # A tibble: 1 × 2
## mean_GrLivArea sd_GrLivArea
## <dbl> <dbl>
## 1 0 1
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 gridExtra_2.3 tidyr_1.3.1 rsample_1.3.1
## [5] AmesHousing_0.0.4 recipes_1.3.1 visdat_0.6.0 ggplot2_4.0.1
## [9] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.6 sass_0.4.10 future_1.68.0
## [4] generics_0.1.4 class_7.3-23 lattice_0.22-7
## [7] listenv_0.10.0 digest_0.6.37 magrittr_2.0.3
## [10] RColorBrewer_1.1-3 timechange_0.3.0 evaluate_1.0.5
## [13] grid_4.5.2 fastmap_1.2.0 jsonlite_2.0.0
## [16] Matrix_1.7-4 nnet_7.3-20 survival_3.8-3
## [19] purrr_1.1.0 codetools_0.2-20 jquerylib_0.1.4
## [22] lava_1.8.2 cli_3.6.5 rlang_1.1.6
## [25] hardhat_1.4.2 parallelly_1.46.0 future.apply_1.20.1
## [28] splines_4.5.2 withr_3.0.2 cachem_1.1.0
## [31] yaml_2.3.10 prodlim_2025.04.28 tools_4.5.2
## [34] parallel_4.5.2 globals_0.18.0 vctrs_0.6.5
## [37] R6_2.6.1 rpart_4.1.24 lifecycle_1.0.4
## [40] lubridate_1.9.4 MASS_7.3-65 furrr_0.3.1
## [43] pkgconfig_2.0.3 gtable_0.3.6 pillar_1.11.1
## [46] bslib_0.9.0 glue_1.8.0 data.table_1.17.8
## [49] Rcpp_1.1.0 xfun_0.56 tibble_3.3.0
## [52] tidyselect_1.2.1 rstudioapi_0.17.1 knitr_1.50
## [55] farver_2.1.2 htmltools_0.5.8.1 labeling_0.4.3
## [58] rmarkdown_2.30 ipred_0.9-15 timeDate_4051.111
## [61] gower_1.0.2 compiler_4.5.2 S7_0.2.0