# Reproducing outputs from Sections 3.1–3.5 of Hands-On Machine Learning with R by Brad Boehmke & Brandon Greenwell.
# Helper packages
library(dplyr) # for data manipulation
library(ggplot2) # for awesome graphics
library(visdat) # for additional visualizations
# Feature engineering packages
library(caret) # for various ML tasks
library(recipes) # for feature engineering tasks
# We continue with the ames_train data set created in Section 2.7:
# Recreate the train/test split of the Ames housing data (Section 2.7).
library(rsample)
ames <- AmesHousing::make_ames()
# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# Keep 70% of the rows for training, stratifying on the response.
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# Sale_Price
library(gridExtra) # provides grid.arrange() for side-by-side plots
# Histogram of the training-set response on its original dollar scale.
p1 <- ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  scale_x_continuous("Sale Price", labels = scales::dollar) +
  labs(title = "Original Sale_Price") +
  theme_minimal()
# Same histogram drawn on a log10-scaled x axis.
p2 <- ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 50, fill = "steelblue", colour = "white") +
  scale_x_log10("Sale Price (log10 scale)", labels = scales::dollar) +
  labs(title = "Log-Transformed Sale_Price") +
  theme_minimal()
# Show both histograms in one row; namespace-qualified so this chunk does not
# depend on gridExtra having been attached earlier.
gridExtra::grid.arrange(p1, p2, nrow = 1)
# Figure 3.1: Transforming the response variable to minimize skewness can
# resolve concerns with non-normally distributed errors.
library(forecast)
# Estimate lambda on training data
lambda <- BoxCox.lambda(ames_train$Sale_Price)
lambda
## [1] 0.138742
# Apply Box-Cox
bc_transformed <- BoxCox(ames_train$Sale_Price, lambda)
# Histogram of the Box-Cox transformed response; the title reports the
# estimated lambda rounded to three decimals.
p3 <- ggplot(data.frame(x = bc_transformed), aes(x)) +
  geom_histogram(bins = 50, fill = "tomato", colour = "white") +
  labs(
    title = paste0("Box-Cox (λ = ", round(lambda, 3), ")"),
    x = "Transformed Sale_Price"
  ) +
  theme_minimal()
# Put the log-scale histogram (p2, built for Figure 3.1) next to the Box-Cox
# one; namespace-qualified so gridExtra need not be attached.
gridExtra::grid.arrange(p2, p3, nrow = 1)
# Figure 3.2: Response variable transformations.
## [1] 10
# Forward step: Box-Cox transform the single value 10 with the lambda
# estimated on the training data.
y <- forecast::BoxCox(x = 10, lambda = lambda)
# Inverse of the Box-Cox transform.
#
# x:      value(s) on the Box-Cox transformed scale.
# lambda: the (scalar) power parameter used for the forward transform.
#
# Returns x mapped back to the original scale; attributes carried by x are
# passed through by the arithmetic.
inv_box_cox <- function(x, lambda) {
  # lambda = 0 means the forward transform was log(), so undo it with exp()
  if (lambda == 0) {
    return(exp(x))
  }
  # otherwise undo (y^lambda - 1) / lambda
  scaled <- lambda * x + 1
  scaled^(1 / lambda)
}
# Round trip: inverting the transformed value gives back the original 10.
# The printed result also shows a "lambda" attribute carried on the value.
inv_box_cox(x = y, lambda = lambda)
## [1] 10
## attr(,"lambda")
## [1] 0.138742
# geom_raster()
# Heat map of missingness in the raw Ames data.
# is.na() turns the data into a logical matrix and reshape2::melt() reshapes
# it to long form, giving the Var1, Var2 and value columns used in aes().
AmesHousing::ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  # swap the axes so the Var2 values run down the vertical axis
  coord_flip() +
  # no axis title, and no padding around the raster
  scale_y_continuous(NULL, expand = c(0, 0)) +
  scale_fill_grey(
    name = "",
    labels = c("Present", "Missing")
  ) +
  xlab("Observation") +
  # shrink the tick labels so they fit
  theme(axis.text.y = element_text(size = 4))
# Figure 3.3: Heat map of missing values in the raw Ames housing data.
# Inspect the rows whose garage type is missing, alongside the garage car
# count and area recorded for those same rows.
AmesHousing::ames_raw %>%
  dplyr::filter(is.na(`Garage Type`)) %>%
  dplyr::select(`Garage Type`, `Garage Cars`, `Garage Area`)
## # A tibble: 157 × 3
## `Garage Type` `Garage Cars` `Garage Area`
## <chr> <int> <int>
## 1 <NA> 0 0
## 2 <NA> 0 0
## 3 <NA> 0 0
## 4 <NA> 0 0
## 5 <NA> 0 0
## 6 <NA> 0 0
## 7 <NA> 0 0
## 8 <NA> 0 0
## 9 <NA> 0 0
## 10 <NA> 0 0
## # ℹ 147 more rows
## rowname freqRatio percentUnique zeroVar nzv
## 1 Street 226.66667 0.09760859 FALSE TRUE
## 2 Alley 24.25316 0.14641288 FALSE TRUE
## 3 Land_Contour 19.50000 0.19521718 FALSE TRUE
## 4 Utilities 1023.00000 0.14641288 FALSE TRUE
## 5 Land_Slope 22.15909 0.14641288 FALSE TRUE
## 6 Condition_2 202.60000 0.34163006 FALSE TRUE
## 7 Roof_Matl 144.35714 0.39043436 FALSE TRUE
## 8 Bsmt_Cond 20.24444 0.29282577 FALSE TRUE
## 9 BsmtFin_Type_2 25.85294 0.34163006 FALSE TRUE
## 10 BsmtFin_SF_2 453.25000 9.37042460 FALSE TRUE
## 11 Heating 106.00000 0.29282577 FALSE TRUE
## 12 Low_Qual_Fin_SF 1010.50000 1.31771596 FALSE TRUE
## 13 Kitchen_AbvGr 21.23913 0.19521718 FALSE TRUE
## 14 Functional 38.89796 0.39043436 FALSE TRUE
## 15 Enclosed_Porch 102.05882 7.41825281 FALSE TRUE
## 16 Three_season_porch 673.66667 1.12249878 FALSE TRUE
## 17 Screen_Porch 169.90909 4.63640800 FALSE TRUE
## 18 Pool_Area 2039.00000 0.53684724 FALSE TRUE
## 19 Pool_QC 509.75000 0.24402147 FALSE TRUE
## 20 Misc_Feature 34.18966 0.24402147 FALSE TRUE
## 21 Misc_Val 180.54545 1.56173743 FALSE TRUE
# Add zero- and near-zero variance steps to the recipe:
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forecast_8.24.0 gridExtra_2.3 rsample_1.3.1 recipes_1.3.1
## [5] caret_7.0-1 lattice_0.22-7 visdat_0.6.0 ggplot2_4.0.0
## [9] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4051.111 farver_2.1.2
## [4] S7_0.2.0 fastmap_1.2.0 pROC_1.19.0.1
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.4 AmesHousing_0.0.4 survival_3.8-3
## [13] magrittr_2.0.3 compiler_4.5.2 rlang_1.1.6
## [16] sass_0.4.10 tools_4.5.2 utf8_1.2.6
## [19] yaml_2.3.10 data.table_1.17.8 knitr_1.50
## [22] labeling_0.4.3 curl_7.0.0 TTR_0.24.4
## [25] plyr_1.8.9 RColorBrewer_1.1-3 withr_3.0.2
## [28] purrr_1.1.0 nnet_7.3-20 grid_4.5.2
## [31] stats4_4.5.2 xts_0.14.1 colorspace_2.1-2
## [34] future_1.67.0 globals_0.18.0 scales_1.4.0
## [37] iterators_1.0.14 MASS_7.3-65 cli_3.6.5
## [40] rmarkdown_2.29 generics_0.1.4 rstudioapi_0.17.1
## [43] future.apply_1.20.0 reshape2_1.4.5 cachem_1.1.0
## [46] stringr_1.5.2 splines_4.5.2 parallel_4.5.2
## [49] urca_1.3-4 vctrs_0.6.5 hardhat_1.4.2
## [52] Matrix_1.7-4 jsonlite_2.0.0 tseries_0.10-58
## [55] listenv_0.9.1 foreach_1.5.2 gower_1.0.2
## [58] jquerylib_0.1.4 tidyr_1.3.1 quantmod_0.4.28
## [61] glue_1.8.0 parallelly_1.45.1 codetools_0.2-20
## [64] lubridate_1.9.4 stringi_1.8.7 gtable_0.3.6
## [67] quadprog_1.5-8 lmtest_0.9-40 tibble_3.3.0
## [70] pillar_1.11.1 furrr_0.3.1 htmltools_0.5.8.1
## [73] ipred_0.9-15 lava_1.8.2 R6_2.6.1
## [76] evaluate_1.0.5 fracdiff_1.5-3 bslib_0.9.0
## [79] class_7.3-23 Rcpp_1.1.0 nlme_3.1-168
## [82] prodlim_2025.04.28 xfun_0.53 zoo_1.8-14
## [85] ModelMetrics_1.2.2.2 pkgconfig_2.0.3