Reproducing outputs from Sections 3.1–3.5 of Hands-On Machine Learning with R by Brad Boehmke & Brandon Greenwell.


3.1 Prerequisites

# Helper packages
library(dplyr)    # for data manipulation
library(ggplot2)  # for awesome graphics
library(visdat)   # for additional visualizations

# Feature engineering packages
library(caret)    # for various ML tasks
library(recipes)  # for feature engineering tasks

We continue with the ames_train data set created in Section 2.7:

library(rsample)

# Build the Ames housing data set
ames <- AmesHousing::make_ames()

# Stratified 70/30 train/test split on Sale_Price.
# The seed is fixed immediately before the split so it is reproducible.
set.seed(123)
split <- rsample::initial_split(
  data   = ames,
  prop   = 0.7,
  strata = "Sale_Price"
)
ames_train <- rsample::training(split)
ames_test  <- rsample::testing(split)

3.2 Target Engineering

Visualising the skew in Sale_Price

library(gridExtra)

# Left panel: Sale_Price on its original dollar scale
p1 <- ggplot(data = ames_train, mapping = aes(Sale_Price)) +
  geom_histogram(
    bins   = 50,
    colour = "white",
    fill   = "steelblue"
  ) +
  scale_x_continuous(name = "Sale Price", labels = scales::dollar) +
  labs(title = "Original Sale_Price") +
  theme_minimal()

# Right panel: same histogram drawn on a log10 x-axis
p2 <- ggplot(data = ames_train, mapping = aes(Sale_Price)) +
  geom_histogram(
    bins   = 50,
    colour = "white",
    fill   = "steelblue"
  ) +
  scale_x_log10(name = "Sale Price (log10 scale)", labels = scales::dollar) +
  labs(title = "Log-Transformed Sale_Price") +
  theme_minimal()

# Show both panels side by side in a single row
gridExtra::grid.arrange(p1, p2, nrow = 1)
Figure 3.1: Transforming the response variable to minimize skewness can resolve concerns with non-normally distributed errors.

Figure 3.1: Transforming the response variable to minimize skewness can resolve concerns with non-normally distributed errors.

Option 1 – Log transformation (manual)

# Natural-log transform of the training response
transformed_response <- log(ames_train[["Sale_Price"]])

Option 1 – Log transformation (recipe blueprint)

# Recipe blueprint: model Sale_Price on all other columns and
# log-transform every outcome variable
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

# Print the blueprint
ames_recipe

Handling small non-positive values (greater than -1) with log1p()

# log() of a negative number is not defined, so the result is NaN
log(-0.5)
## [1] NaN
# log1p(x) computes log(1 + x); here 1 + (-0.5) = 0.5, which is positive,
# so the result is a finite number
log1p(-0.5)
## [1] -0.6931472

Option 2 – Box-Cox transformation

library(forecast)

# Estimate the Box-Cox lambda from the training response only
lambda <- forecast::BoxCox.lambda(ames_train[["Sale_Price"]])
lambda
## [1] 0.138742
# Transform the training response with the estimated lambda
bc_transformed <- forecast::BoxCox(ames_train[["Sale_Price"]], lambda)

# Histogram of the Box-Cox transformed response; the title reports lambda
p3 <- ggplot(data = data.frame(x = bc_transformed), mapping = aes(x = x)) +
  geom_histogram(
    bins   = 50,
    colour = "white",
    fill   = "tomato"
  ) +
  labs(
    title = paste0("Box-Cox (λ = ", round(lambda, 3), ")"),
    x     = "Transformed Sale_Price"
  ) +
  theme_minimal()

# Compare the log-scale histogram (p2) with the Box-Cox histogram (p3)
gridExtra::grid.arrange(p2, p3, nrow = 1)
Figure 3.2: Response variable transformations.

Figure 3.2: Response variable transformations.

Inverse transformations

# Log transform a value (natural logarithm)
y <- log(10)

# Undo log-transformation: exp() inverts log(), recovering the original 10
exp(y)
## [1] 10
# Box Cox transform a value, reusing the `lambda` estimated earlier in this
# script; note that this overwrites the `y` assigned above
y <- forecast::BoxCox(10, lambda)

# Inverse Box Cox function
#
# Maps Box-Cox transformed values back to the original scale.
#
# Arguments:
#   x      Numeric vector of Box-Cox transformed values.
#   lambda Single, non-missing numeric Box-Cox parameter that was used for the
#          forward transformation.
#
# Returns a numeric vector the same length as `x`. Attributes attached to `x`
# are carried through by the arithmetic.
inv_box_cox <- function(x, lambda) {
  # Fail early with a clear message instead of an obscure `if` condition error
  stopifnot(is.numeric(lambda), length(lambda) == 1L, !is.na(lambda))

  if (lambda == 0) {
    # for Box-Cox, lambda = 0 --> log transform
    exp(x)
  } else if (abs(lambda) < 1e-6) {
    # For a tiny non-zero lambda, `lambda * x + 1` rounds to (almost) exactly 1
    # and the power form degenerates to 1 for every x. The algebraically
    # identical log1p form stays accurate in that regime.
    exp(log1p(lambda * x) / lambda)
  } else {
    (lambda * x + 1)^(1 / lambda)
  }
}

# Undo Box Cox-transformation: the original value 10 is recovered
# NOTE(review): the "lambda" attribute printed below presumably comes from the
# object returned by forecast::BoxCox() and is carried through the
# arithmetic -- confirm
inv_box_cox(y, lambda)
## [1] 10
## attr(,"lambda")
## [1] 0.138742

3.3 Dealing with Missingness

Total missing values in the raw Ames data

# Count every missing cell across the whole raw data set
AmesHousing::ames_raw %>%
  is.na() %>%
  sum()
## [1] 13997

3.3.1 Visualizing Missing Values

Figure 3.3 – Heat map via geom_raster()

# Missingness heat map: one raster cell per (Var1, Var2) pair, filled by
# whether the value is missing
# NOTE(review): `x` is mapped to Var2, so the "Observation" label is attached
# to the Var2 axis -- confirm that is the intended axis
AmesHousing::ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(mapping = aes(x = Var2, y = Var1, fill = value)) +
    geom_raster() +
    coord_flip() +
    scale_y_continuous(name = NULL, expand = c(0, 0)) +
    scale_fill_grey(
      labels = c("Present", "Missing"),
      name   = ""
    ) +
    labs(x = "Observation") +
    theme(axis.text.y = element_text(size = 4))
Figure 3.3: Heat map of missing values in the raw Ames housing data.

Figure 3.3: Heat map of missing values in the raw Ames housing data.

Investigating garage variables

# Rows where `Garage Type` is missing, showing only the three garage columns
AmesHousing::ames_raw %>%
  dplyr::filter(is.na(.data[["Garage Type"]])) %>%
  dplyr::select(dplyr::all_of(c("Garage Type", "Garage Cars", "Garage Area")))
## # A tibble: 157 × 3
##    `Garage Type` `Garage Cars` `Garage Area`
##    <chr>                 <int>         <int>
##  1 <NA>                      0             0
##  2 <NA>                      0             0
##  3 <NA>                      0             0
##  4 <NA>                      0             0
##  5 <NA>                      0             0
##  6 <NA>                      0             0
##  7 <NA>                      0             0
##  8 <NA>                      0             0
##  9 <NA>                      0             0
## 10 <NA>                      0             0
## # ℹ 147 more rows

Figure 3.4 – vis_miss() clustered heat map

# Missingness plot of the raw data with clustering switched on
AmesHousing::ames_raw %>%
  visdat::vis_miss(cluster = TRUE)
Figure 3.4: Visualizing missing data patterns in the raw Ames housing data.

Figure 3.4: Visualizing missing data patterns in the raw Ames housing data.

3.3.2 Imputation

3.3.2.1 Estimated statistic (median)

# Add a median-imputation step for Gr_Liv_Area to the blueprint
step_impute_median(ames_recipe, Gr_Liv_Area)

3.3.2.2 K-Nearest Neighbor

# Add a K-nearest-neighbor imputation step (6 neighbors) for all predictors
step_impute_knn(
  ames_recipe,
  all_predictors(),
  neighbors = 6
)

3.3.2.3 Tree-based (bagged trees)

# Add a bagged-tree imputation step for all predictors
step_impute_bag(ames_recipe, all_predictors())

3.4 Feature Filtering

# Compute near-zero-variance metrics per column, move the row names into a
# regular column, and keep only the rows flagged as nzv
ames_train %>%
  caret::nearZeroVar(saveMetrics = TRUE) %>%
  tibble::rownames_to_column() %>%
  dplyr::filter(nzv)
##               rowname  freqRatio percentUnique zeroVar  nzv
## 1              Street  226.66667    0.09760859   FALSE TRUE
## 2               Alley   24.25316    0.14641288   FALSE TRUE
## 3        Land_Contour   19.50000    0.19521718   FALSE TRUE
## 4           Utilities 1023.00000    0.14641288   FALSE TRUE
## 5          Land_Slope   22.15909    0.14641288   FALSE TRUE
## 6         Condition_2  202.60000    0.34163006   FALSE TRUE
## 7           Roof_Matl  144.35714    0.39043436   FALSE TRUE
## 8           Bsmt_Cond   20.24444    0.29282577   FALSE TRUE
## 9      BsmtFin_Type_2   25.85294    0.34163006   FALSE TRUE
## 10       BsmtFin_SF_2  453.25000    9.37042460   FALSE TRUE
## 11            Heating  106.00000    0.29282577   FALSE TRUE
## 12    Low_Qual_Fin_SF 1010.50000    1.31771596   FALSE TRUE
## 13      Kitchen_AbvGr   21.23913    0.19521718   FALSE TRUE
## 14         Functional   38.89796    0.39043436   FALSE TRUE
## 15     Enclosed_Porch  102.05882    7.41825281   FALSE TRUE
## 16 Three_season_porch  673.66667    1.12249878   FALSE TRUE
## 17       Screen_Porch  169.90909    4.63640800   FALSE TRUE
## 18          Pool_Area 2039.00000    0.53684724   FALSE TRUE
## 19            Pool_QC  509.75000    0.24402147   FALSE TRUE
## 20       Misc_Feature   34.18966    0.24402147   FALSE TRUE
## 21           Misc_Val  180.54545    1.56173743   FALSE TRUE

Add zero- and near-zero variance steps to the recipe:

# Zero-variance filter first, then near-zero-variance filter, both applied to
# all predictors
step_nzv(
  step_zv(ames_recipe, all_predictors()),
  all_predictors()
)

3.5 Numeric Feature Engineering

3.5.1 Skewness

# Normalize all numeric columns with a Yeo-Johnson step on a fresh recipe
step_YeoJohnson(
  recipe(Sale_Price ~ ., data = ames_train),
  all_numeric()
)

3.5.2 Standardization

# Standardize numeric columns except the outcome: center first, then scale
step_scale(
  step_center(ames_recipe, all_numeric(), -all_outcomes()),
  all_numeric(),
  -all_outcomes()
)

Session Info

# Record the R version, platform and package versions used for this run
utils::sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forecast_8.24.0 gridExtra_2.3   rsample_1.3.1   recipes_1.3.1  
## [5] caret_7.0-1     lattice_0.22-7  visdat_0.6.0    ggplot2_4.0.0  
## [9] dplyr_1.1.4    
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4051.111    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.4      AmesHousing_0.0.4    survival_3.8-3      
## [13] magrittr_2.0.3       compiler_4.5.2       rlang_1.1.6         
## [16] sass_0.4.10          tools_4.5.2          utf8_1.2.6          
## [19] yaml_2.3.10          data.table_1.17.8    knitr_1.50          
## [22] labeling_0.4.3       curl_7.0.0           TTR_0.24.4          
## [25] plyr_1.8.9           RColorBrewer_1.1-3   withr_3.0.2         
## [28] purrr_1.1.0          nnet_7.3-20          grid_4.5.2          
## [31] stats4_4.5.2         xts_0.14.1           colorspace_2.1-2    
## [34] future_1.67.0        globals_0.18.0       scales_1.4.0        
## [37] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [40] rmarkdown_2.29       generics_0.1.4       rstudioapi_0.17.1   
## [43] future.apply_1.20.0  reshape2_1.4.5       cachem_1.1.0        
## [46] stringr_1.5.2        splines_4.5.2        parallel_4.5.2      
## [49] urca_1.3-4           vctrs_0.6.5          hardhat_1.4.2       
## [52] Matrix_1.7-4         jsonlite_2.0.0       tseries_0.10-58     
## [55] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [58] jquerylib_0.1.4      tidyr_1.3.1          quantmod_0.4.28     
## [61] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [64] lubridate_1.9.4      stringi_1.8.7        gtable_0.3.6        
## [67] quadprog_1.5-8       lmtest_0.9-40        tibble_3.3.0        
## [70] pillar_1.11.1        furrr_0.3.1          htmltools_0.5.8.1   
## [73] ipred_0.9-15         lava_1.8.2           R6_2.6.1            
## [76] evaluate_1.0.5       fracdiff_1.5-3       bslib_0.9.0         
## [79] class_7.3-23         Rcpp_1.1.0           nlme_3.1-168        
## [82] prodlim_2025.04.28   xfun_0.53            zoo_1.8-14          
## [85] ModelMetrics_1.2.2.2 pkgconfig_2.0.3