knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.align = "center")
pkgs <- c("AmesHousing","rsample","dplyr","ggplot2","visdat",
"caret","recipes","reshape2","forecast","tidyr","purrr","broom","scales")
install.packages(setdiff(pkgs, rownames(installed.packages())), repos = "https://cloud.r-project.org")
3.1 Prerequisites
library(dplyr); library(ggplot2); library(visdat)
library(caret); library(recipes)
library(purrr); library(tidyr); library(broom)
library(AmesHousing); library(rsample); library(scales)
ggplot2::theme_set(ggplot2::theme_light())
ames <- AmesHousing::make_ames()
set.seed(123)
split <- rsample::initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test <- rsample::testing(split)
3.2 Target Engineering
models <- c("Non-log transformed model residuals", "Log transformed model residuals")
list(
m1 = lm(Sale_Price ~ Year_Built, data = ames_train),
m2 = lm(log(Sale_Price) ~ Year_Built, data = ames_train)
) %>%
map2_dfr(models, ~ broom::augment(.x) %>% mutate(model = .y)) %>%
ggplot(aes(.resid)) +
geom_histogram(bins = 75) +
facet_wrap(~ model, scales = "free_x") +
ylab(NULL) + xlab("Residuals")

ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_log(all_outcomes())
ames_recipe
log(-0.5)
## [1] NaN
log1p(-0.5)
## [1] -0.6931472
recipe(Sale_Price ~ ., data = ames_train) %>%
step_BoxCox(all_outcomes())
lambda <- forecast::BoxCox.lambda(ames_train$Sale_Price)
train_bc_y <- forecast::BoxCox(ames_train$Sale_Price, lambda)
data.frame(
Normal = ames_train$Sale_Price,
Log_Transform = log(ames_train$Sale_Price),
BoxCox_Transform = train_bc_y
) %>%
gather(Transform, Value) %>%
mutate(Transform = factor(Transform, levels = c("Normal","Log_Transform","BoxCox_Transform"))) %>%
ggplot(aes(Value, fill = Transform)) +
geom_histogram(show.legend = FALSE, bins = 40) +
facet_wrap(~ Transform, scales = "free_x") +
xlab("Value") + ylab("count")

y <- log(10)
exp(y)
## [1] 10
y <- forecast::BoxCox(10, lambda)
inv_box_cox <- function(x, lambda) {
if (lambda == 0) exp(x) else (lambda * x + 1)^(1 / lambda)
}
inv_box_cox(y, lambda)
## [1] 10
## attr(,"lambda")
## [1] 0.138742
3.3 Dealing with Missingness
sum(is.na(AmesHousing::ames_raw))
## [1] 13997
# Figure 3.3 - horizontal (no coord_flip)
AmesHousing::ames_raw %>%
is.na() %>%
reshape2::melt() %>%
ggplot(aes(Var1, Var2, fill = value)) +
geom_raster() +
scale_x_continuous(NULL, expand = c(0, 0)) +
scale_fill_grey(name = "", labels = c("Present", "Missing")) +
ylab("Observation") +
theme(axis.text.x = element_text(size = 4, angle = 90, hjust = 1))

# Figure 3.4 - wider plot so column names are fully visible
vis_miss(AmesHousing::ames_raw, cluster = TRUE) +
theme(axis.text.x = element_text(size = 6, angle = 90, hjust = 1))

AmesHousing::ames_raw %>%
filter(is.na(`Garage Type`)) %>%
select(`Garage Type`, `Garage Cars`, `Garage Area`)
## # A tibble: 157 × 3
## `Garage Type` `Garage Cars` `Garage Area`
## <chr> <int> <int>
## 1 <NA> 0 0
## 2 <NA> 0 0
## 3 <NA> 0 0
## 4 <NA> 0 0
## 5 <NA> 0 0
## 6 <NA> 0 0
## 7 <NA> 0 0
## 8 <NA> 0 0
## 9 <NA> 0 0
## 10 <NA> 0 0
## # ℹ 147 more rows
ames_recipe %>% step_impute_median(Gr_Liv_Area)
ames_recipe %>% step_impute_knn(all_predictors(), neighbors = 6)
ames_recipe %>% step_impute_bag(all_predictors())
# Figure 3.5 - fix grey background dots showing in all panels
set.seed(123)
sub_data <- ames_train %>%
select(Gr_Liv_Area, Sale_Price, Lot_Area, Year_Built, Garage_Area) %>%
mutate(across(everything(), as.numeric))
missing_idx <- sample(nrow(sub_data), 75)
actual_vals <- sub_data$Gr_Liv_Area[missing_idx]
sub_missing <- sub_data
sub_missing$Gr_Liv_Area[missing_idx] <- NA
# Mean imputation
imp_mean <- sub_missing
imp_mean$Gr_Liv_Area[missing_idx] <- mean(sub_data$Gr_Liv_Area)
# KNN imputation
imp_knn <- prep(
recipe(Sale_Price ~ ., data = sub_missing) %>%
step_impute_knn(Gr_Liv_Area, neighbors = 5),
training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)
# Bagged tree imputation
imp_bag <- prep(
recipe(Sale_Price ~ ., data = sub_missing) %>%
step_impute_bag(Gr_Liv_Area),
training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)
# Build tidy plot data - background points in ALL panels
all_bg <- data.frame(
Gr_Liv_Area = sub_data$Gr_Liv_Area,
Sale_Price = sub_data$Sale_Price,
type = "background"
)
make_panel <- function(imp_vec, label) {
bind_rows(
all_bg %>% mutate(Panel = label),
data.frame(
Gr_Liv_Area = imp_vec[missing_idx],
Sale_Price = sub_data$Sale_Price[missing_idx],
type = "imputed",
Panel = label
),
data.frame(
Gr_Liv_Area = actual_vals,
Sale_Price = sub_data$Sale_Price[missing_idx],
type = "actual",
Panel = label
)
)
}
plot_df <- bind_rows(
# Actual values panel
bind_rows(
all_bg %>% mutate(
type = ifelse(seq_len(nrow(sub_data)) %in% missing_idx, "actual", "background"),
Panel = "Actual values"
)
),
make_panel(imp_mean$Gr_Liv_Area, "Mean Imputation"),
make_panel(imp_knn, "KNN Imputation"),
make_panel(imp_bag, "Bagged Trees Imputation")
) %>%
mutate(Panel = factor(Panel, levels = c("Actual values","Mean Imputation",
"KNN Imputation","Bagged Trees Imputation")))
ggplot(plot_df, aes(x = Gr_Liv_Area, y = Sale_Price, colour = type, alpha = type, size = type)) +
geom_point() +
facet_wrap(~ Panel, ncol = 2) +
scale_colour_manual(
values = c(background = "grey60", actual = "#D62728", imputed = "#1F77B4"),
labels = c("Background", "Actual (removed)", "Imputed"),
name = NULL
) +
scale_alpha_manual(values = c(background = 0.3, actual = 0.9, imputed = 0.9), guide = "none") +
scale_size_manual(values = c(background = 1, actual = 2, imputed = 2), guide = "none") +
scale_x_log10() + scale_y_log10() +
labs(x = "Gr_Liv_Area", y = "Sale_Price") +
theme(legend.position = "bottom")

3.4 Feature Filtering
caret::nearZeroVar(ames_train, saveMetrics = TRUE) %>%
tibble::rownames_to_column() %>%
filter(nzv)
## rowname freqRatio percentUnique zeroVar nzv
## 1 Street 226.66667 0.09760859 FALSE TRUE
## 2 Alley 24.25316 0.14641288 FALSE TRUE
## 3 Land_Contour 19.50000 0.19521718 FALSE TRUE
## 4 Utilities 1023.00000 0.14641288 FALSE TRUE
## 5 Land_Slope 22.15909 0.14641288 FALSE TRUE
## 6 Condition_2 202.60000 0.34163006 FALSE TRUE
## 7 Roof_Matl 144.35714 0.39043436 FALSE TRUE
## 8 Bsmt_Cond 20.24444 0.29282577 FALSE TRUE
## 9 BsmtFin_Type_2 25.85294 0.34163006 FALSE TRUE
## 10 BsmtFin_SF_2 453.25000 9.37042460 FALSE TRUE
## 11 Heating 106.00000 0.29282577 FALSE TRUE
## 12 Low_Qual_Fin_SF 1010.50000 1.31771596 FALSE TRUE
## 13 Kitchen_AbvGr 21.23913 0.19521718 FALSE TRUE
## 14 Functional 38.89796 0.39043436 FALSE TRUE
## 15 Enclosed_Porch 102.05882 7.41825281 FALSE TRUE
## 16 Three_season_porch 673.66667 1.12249878 FALSE TRUE
## 17 Screen_Porch 169.90909 4.63640800 FALSE TRUE
## 18 Pool_Area 2039.00000 0.53684724 FALSE TRUE
## 19 Pool_QC 509.75000 0.24402147 FALSE TRUE
## 20 Misc_Feature 34.18966 0.24402147 FALSE TRUE
## 21 Misc_Val 180.54545 1.56173743 FALSE TRUE
ames_recipe %>% step_nzv(all_predictors())
# Figure 3.6 - RMSE profiles (reconstructed from book values)
n_steps <- c(0, 100, 200, 300, 400, 500)
rmse_data <- bind_rows(
# Linear models
data.frame(n=n_steps, RMSE=c(0.68,0.75,0.82,0.92,1.10,1.30), model="Elastic net", Panel="Linear models"),
data.frame(n=n_steps, RMSE=c(0.67,0.70,0.72,0.73,0.73,0.74), model="Lasso", Panel="Linear models"),
data.frame(n=n_steps, RMSE=c(0.68,0.75,0.82,0.92,1.10,1.30), model="Linear regression", Panel="Linear models"),
data.frame(n=n_steps, RMSE=c(0.72,0.82,0.93,1.13,1.38,1.52), model="Partial least squares", Panel="Linear models"),
data.frame(n=n_steps, RMSE=c(0.73,0.82,0.88,0.92,0.96,0.99), model="Principal component regression", Panel="Linear models"),
# Non-linear models
data.frame(n=n_steps, RMSE=c(0.68,0.68,0.68,0.68,0.68,0.68), model="Multivariate adaptive regression splines", Panel="Non-linear Models"),
data.frame(n=n_steps, RMSE=c(0.89,0.93,0.93,0.94,0.93,0.95), model="Neural net", Panel="Non-linear Models"),
data.frame(n=n_steps, RMSE=c(0.74,0.88,1.20,1.22,0.97,1.58), model="Support vector machine", Panel="Non-linear Models"),
# Tree-based models
data.frame(n=n_steps, RMSE=c(0.67,0.68,0.69,0.70,0.72,0.74), model="Gradient boosting machines", Panel="Tree-based Models"),
data.frame(n=n_steps, RMSE=c(0.69,0.71,0.73,0.75,0.75,0.75), model="Random forest", Panel="Tree-based Models")
) %>%
mutate(Panel = factor(Panel, levels = c("Linear models","Non-linear Models","Tree-based Models")))
ggplot(rmse_data, aes(x = n, y = RMSE, color = model, linetype = model)) +
geom_line(size = 0.8) +
geom_point(size = 2) +
facet_wrap(~ Panel) +
labs(
x = "Number of additional non-informative predictors",
y = "RMSE",
color = "model", linetype = "model"
) +
theme(legend.position = "right",
legend.text = element_text(size = 8),
legend.key.width = unit(1.5, "cm"))

# Figure 3.7 - Training time increase (reconstructed from book values)
time_data <- bind_rows(
# Linear models
data.frame(n=n_steps, pct=c(0,70,115,180,430,630), model="Elastic net", Panel="Linear models"),
data.frame(n=n_steps, pct=c(0,25, 45, 85,185,285), model="Lasso", Panel="Linear models"),
data.frame(n=n_steps, pct=c(0,10, 40,160,270,280), model="Linear regression", Panel="Linear models"),
data.frame(n=n_steps, pct=c(0,60,120,220,450,810), model="Partial least squares", Panel="Linear models"),
data.frame(n=n_steps, pct=c(0,55,110,200,320,340), model="Principal component regression", Panel="Linear models"),
# Non-linear models
data.frame(n=n_steps, pct=c(0, 5, 15, 30, 55, 80), model="Multivariate adaptive regression splines", Panel="Non-linear Models"),
data.frame(n=n_steps, pct=c(0,60,130,280,490,840), model="Neural net", Panel="Non-linear Models"),
data.frame(n=n_steps, pct=c(0,95,230,465,660,840), model="Support vector machine", Panel="Non-linear Models"),
# Tree-based models
data.frame(n=n_steps, pct=c(0,15, 55, 90,135,240), model="Gradient boosting machines", Panel="Tree-based Models"),
data.frame(n=n_steps, pct=c(0, 5, 20, 55, 90,150), model="Random forest", Panel="Tree-based Models")
) %>%
mutate(Panel = factor(Panel, levels = c("Linear models","Non-linear Models","Tree-based Models")))
ggplot(time_data, aes(x = n, y = pct, color = model, linetype = model)) +
geom_line(size = 0.8) +
geom_point(size = 2) +
facet_wrap(~ Panel) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
labs(
x = "Number of additional non-informative predictors",
y = "Percent increase in training time",
color = "model", linetype = "model"
) +
theme(legend.position = "right",
legend.text = element_text(size = 8),
legend.key.width = unit(1.5, "cm"))

3.5 Numeric Feature Engineering
recipe(Sale_Price ~ ., data = ames_train) %>%
step_YeoJohnson(all_numeric())
ames_recipe %>%
step_center(all_numeric(), -all_outcomes()) %>%
step_scale(all_numeric(), -all_outcomes())
# Figure 3.8 - match original: x1 near 0, x2 spread wide, x3 near 140-160
set.seed(7)
n <- 50
feat_df <- data.frame(
x1 = rnorm(n, 0, 1),
x2 = c(rnorm(40, 20, 15), runif(10, 200, 250)), # spread with outliers like original
x3 = rnorm(n, 145, 8)
)
std_df <- as.data.frame(scale(feat_df))
bind_rows(
pivot_longer(feat_df, everything(), names_to = "Feature", values_to = "Value") %>%
mutate(Scale = "Real value"),
pivot_longer(std_df, everything(), names_to = "Feature", values_to = "Value") %>%
mutate(Scale = "Standardized value")
) %>%
mutate(Scale = factor(Scale, levels = c("Real value", "Standardized value"))) %>%
ggplot(aes(x = Value, y = Feature)) +
geom_jitter(height = 0.07, alpha = 0.6, size = 1.8) +
facet_wrap(~ Scale, scales = "free_x") +
labs(x = "Value", y = "Feature")

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 rsample_1.3.1 AmesHousing_0.0.4 broom_1.0.10
## [5] tidyr_1.3.1 purrr_1.1.0 recipes_1.3.1 caret_7.0-1
## [9] lattice_0.22-7 visdat_0.6.0 ggplot2_4.0.0 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4051.111 farver_2.1.2
## [4] S7_0.2.0 fastmap_1.2.0 pROC_1.19.0.1
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.4 survival_3.8-3 magrittr_2.0.4
## [13] compiler_4.5.1 rlang_1.1.6 sass_0.4.10
## [16] tools_4.5.1 utf8_1.2.6 yaml_2.3.10
## [19] data.table_1.17.8 knitr_1.50 labeling_0.4.3
## [22] curl_7.0.0 TTR_0.24.4 plyr_1.8.9
## [25] RColorBrewer_1.1-3 withr_3.0.2 nnet_7.3-20
## [28] grid_4.5.1 stats4_4.5.1 xts_0.14.1
## [31] colorspace_2.1-2 future_1.67.0 globals_0.18.0
## [34] iterators_1.0.14 MASS_7.3-65 cli_3.6.5
## [37] rmarkdown_2.30 generics_0.1.4 rstudioapi_0.17.1
## [40] future.apply_1.20.0 reshape2_1.4.5 cachem_1.1.0
## [43] stringr_1.5.2 splines_4.5.1 forecast_8.24.0
## [46] parallel_4.5.1 urca_1.3-4 vctrs_0.6.5
## [49] hardhat_1.4.2 Matrix_1.7-3 jsonlite_2.0.0
## [52] tseries_0.10-58 listenv_0.9.1 foreach_1.5.2
## [55] gower_1.0.2 jquerylib_0.1.4 quantmod_0.4.28
## [58] glue_1.8.0 parallelly_1.45.1 codetools_0.2-20
## [61] lubridate_1.9.4 stringi_1.8.7 gtable_0.3.6
## [64] quadprog_1.5-8 lmtest_0.9-40 tibble_3.3.0
## [67] pillar_1.11.1 furrr_0.3.1 htmltools_0.5.8.1
## [70] ipred_0.9-15 lava_1.8.1 R6_2.6.1
## [73] evaluate_1.0.5 backports_1.5.0 fracdiff_1.5-3
## [76] bslib_0.9.0 class_7.3-23 Rcpp_1.1.0
## [79] nlme_3.1-168 prodlim_2025.04.28 xfun_0.53
## [82] zoo_1.8-14 ModelMetrics_1.2.2.2 pkgconfig_2.0.3