Reproducing outputs from Sections 3.1–3.5 of Hands-On Machine Learning with R by Bradley Boehmke & Brandon Greenwell.


3.1 Prerequisites

# Packages used in this chapter
cat("recipes version :", as.character(packageVersion("recipes")),   "\n")
## recipes version : 1.3.1
cat("rsample version :", as.character(packageVersion("rsample")),   "\n")
## rsample version : 1.3.1
cat("AmesHousing     :", as.character(packageVersion("AmesHousing")), "\n")
## AmesHousing     : 0.0.4
# Confirm the train/test split
cat("\nTraining rows:", nrow(ames_train), "| columns:", ncol(ames_train))
## 
## Training rows: 2049 | columns: 81
cat("\nTest rows    :", nrow(ames_test),  "| columns:", ncol(ames_test), "\n")
## 
## Test rows    : 881 | columns: 81
# Peek at the response variable
head(ames_train$Sale_Price)
## [1] 105500  88000 120000 125000  67500 112000

3.2 Target Engineering

A right-skewed response can hurt model performance. Two popular fixes are a log transformation and a Box-Cox / Yeo-Johnson transformation.

Raw Sale_Price distribution

ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  scale_x_continuous(labels = dollar_format()) +
  labs(title = "Sale Price – original scale",
       x = "Sale Price ($)", y = "Count")

Log-transformed Sale_Price

ggplot(ames_train, aes(x = log(Sale_Price))) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  labs(title = "Sale Price – log-transformed",
       x = "log(Sale Price)", y = "Count")

Applying transformations inside a recipes blueprint

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(Sale_Price, base = 10)
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_BoxCox(Sale_Price)
# Yeo-Johnson: works even when values include zero or negatives
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(Sale_Price)

3.3 Dealing with Missingness

3.3.1 Visualising missing values

cat("Total NAs in raw Ames data:", sum(is.na(ames_raw)), "\n")
## Total NAs in raw Ames data: 13997
miss_mat <- is.na(ames_raw)

miss_df <- data.frame(
  row     = rep(seq_len(nrow(miss_mat)), ncol(miss_mat)),
  col     = rep(colnames(miss_mat), each = nrow(miss_mat)),
  missing = as.vector(miss_mat)
)

ggplot(miss_df, aes(x = col, y = row, fill = missing)) +
  geom_raster() +
  coord_flip() +
  scale_fill_manual(
    name   = "",
    values = c("FALSE" = "grey80", "TRUE" = "#E74C3C"),
    labels = c("Present", "Missing")
  ) +
  scale_y_continuous(NULL, expand = c(0, 0)) +
  labs(title = "Missing values – raw Ames housing data", x = NULL) +
  theme_minimal(base_size = 8) +
  theme(axis.text.y = element_text(size = 6))

vis_miss(ames_raw, cluster = FALSE) +
  labs(title = "vis_miss() – raw Ames housing data")

Informative missingness

Properties with no garage have NA for all garage-related fields — the missingness is informative, not random.

ames_raw %>%
  filter(is.na(`Garage Type`)) %>%
  select(`Garage Type`, `Garage Cars`, `Garage Area`) %>%
  head(10)
## # A tibble: 10 × 3
##    `Garage Type` `Garage Cars` `Garage Area`
##    <chr>                 <int>         <int>
##  1 <NA>                      0             0
##  2 <NA>                      0             0
##  3 <NA>                      0             0
##  4 <NA>                      0             0
##  5 <NA>                      0             0
##  6 <NA>                      0             0
##  7 <NA>                      0             0
##  8 <NA>                      0             0
##  9 <NA>                      0             0
## 10 <NA>                      0             0

3.3.2 Imputation

3.3.2.1 Estimated-statistic (mean) imputation

# Handles both old (<0.1.16) and new (>=0.1.16) recipes naming
if ("step_impute_mean" %in% getNamespaceExports("recipes")) {
  rec_mean <- recipe(Sale_Price ~ ., data = ames_train) %>%
    step_impute_mean(all_numeric_predictors())
} else {
  rec_mean <- recipe(Sale_Price ~ ., data = ames_train) %>%
    step_meanimpute(all_numeric())
}
rec_mean

3.3.2.2 K-Nearest Neighbour imputation

if ("step_impute_knn" %in% getNamespaceExports("recipes")) {
  rec_knn <- recipe(Sale_Price ~ ., data = ames_train) %>%
    step_impute_knn(all_predictors(), neighbors = 5)
} else {
  rec_knn <- recipe(Sale_Price ~ ., data = ames_train) %>%
    step_knnimpute(all_predictors(), neighbors = 5)
}
rec_knn

3.3.2.3 Bagged-tree imputation

if ("step_impute_bag" %in% getNamespaceExports("recipes")) {
  rec_bag <- recipe(Sale_Price ~ ., data = ames_train) %>%
    step_impute_bag(all_predictors())
} else {
  rec_bag <- recipe(Sale_Price ~ ., data = ames_train) %>%
    step_bagimpute(all_predictors())
}
rec_bag

3.4 Feature Filtering

Features with near-zero variance carry almost no information and can destabilise some models. step_nzv() removes them automatically.

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors())
nzv_prep <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors()) %>%
  prep(training = ames_train)

flagged <- tidy(nzv_prep, number = 1)
cat("Variables flagged as near-zero variance:", nrow(flagged), "\n\n")
## Variables flagged as near-zero variance: 21
print(flagged)
## # A tibble: 21 × 2
##    terms          id       
##    <chr>          <chr>    
##  1 Street         nzv_w5w6w
##  2 Alley          nzv_w5w6w
##  3 Land_Contour   nzv_w5w6w
##  4 Utilities      nzv_w5w6w
##  5 Land_Slope     nzv_w5w6w
##  6 Condition_2    nzv_w5w6w
##  7 Roof_Matl      nzv_w5w6w
##  8 Bsmt_Cond      nzv_w5w6w
##  9 BsmtFin_Type_2 nzv_w5w6w
## 10 BsmtFin_SF_2   nzv_w5w6w
## # ℹ 11 more rows

3.5 Numeric Feature Engineering

3.5.1 Skewness

Right-skewed predictors can degrade algorithms that assume approximate normality. We compare distributions before and after a Yeo-Johnson transform.

ames_train %>%
  select(Lot_Area, Gr_Liv_Area, TotRms_AbvGrd) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Numeric predictors – BEFORE transformation",
       x = NULL, y = "Count")

rec_yj <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric_predictors())

baked_yj <- rec_yj %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_train)

baked_yj %>%
  select(Lot_Area, Gr_Liv_Area, TotRms_AbvGrd) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 40, fill = "darkorange", colour = "white") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Numeric predictors – AFTER Yeo-Johnson transformation",
       x = NULL, y = "Count")

3.5.2 Standardisation

Distance-based and regularised models need features on a common scale. step_normalize() subtracts the mean and divides by the SD (mean = 0, SD = 1).

rec_std <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_normalize(all_numeric_predictors())

rec_std
baked_std <- rec_std %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_train)

p1 <- ggplot(ames_train, aes(x = Gr_Liv_Area)) +
  geom_histogram(bins = 40, fill = "steelblue", colour = "white") +
  labs(title = "Original", x = "Gr_Liv_Area", y = "Count")

p2 <- ggplot(baked_std, aes(x = Gr_Liv_Area)) +
  geom_histogram(bins = 40, fill = "darkorange", colour = "white") +
  labs(title = "Standardised (mean=0, SD=1)", x = "Gr_Liv_Area", y = "Count")

grid.arrange(p1, p2, ncol = 2,
             top = "Gr_Liv_Area: before vs. after standardisation")

baked_std %>%
  summarise(
    mean_GrLivArea = round(mean(Gr_Liv_Area), 6),
    sd_GrLivArea   = round(sd(Gr_Liv_Area),   6)
  )
## # A tibble: 1 × 2
##   mean_GrLivArea sd_GrLivArea
##            <dbl>        <dbl>
## 1              0            1

Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] scales_1.4.0      gridExtra_2.3     tidyr_1.3.1       rsample_1.3.1    
## [5] AmesHousing_0.0.4 recipes_1.3.1     visdat_0.6.0      ggplot2_4.0.1    
## [9] dplyr_1.1.4      
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6          sass_0.4.10         future_1.68.0      
##  [4] generics_0.1.4      class_7.3-23        lattice_0.22-7     
##  [7] listenv_0.10.0      digest_0.6.37       magrittr_2.0.3     
## [10] RColorBrewer_1.1-3  timechange_0.3.0    evaluate_1.0.5     
## [13] grid_4.5.2          fastmap_1.2.0       jsonlite_2.0.0     
## [16] Matrix_1.7-4        nnet_7.3-20         survival_3.8-3     
## [19] purrr_1.1.0         codetools_0.2-20    jquerylib_0.1.4    
## [22] lava_1.8.2          cli_3.6.5           rlang_1.1.6        
## [25] hardhat_1.4.2       parallelly_1.46.0   future.apply_1.20.1
## [28] splines_4.5.2       withr_3.0.2         cachem_1.1.0       
## [31] yaml_2.3.10         prodlim_2025.04.28  tools_4.5.2        
## [34] parallel_4.5.2      globals_0.18.0      vctrs_0.6.5        
## [37] R6_2.6.1            rpart_4.1.24        lifecycle_1.0.4    
## [40] lubridate_1.9.4     MASS_7.3-65         furrr_0.3.1        
## [43] pkgconfig_2.0.3     gtable_0.3.6        pillar_1.11.1      
## [46] bslib_0.9.0         glue_1.8.0          data.table_1.17.8  
## [49] Rcpp_1.1.0          xfun_0.56           tibble_3.3.0       
## [52] tidyselect_1.2.1    rstudioapi_0.17.1   knitr_1.50         
## [55] farver_2.1.2        htmltools_0.5.8.1   labeling_0.4.3     
## [58] rmarkdown_2.30      ipred_0.9-15        timeDate_4051.111  
## [61] gower_1.0.2         compiler_4.5.2      S7_0.2.0