Chapter 3: Feature & Target Engineering (Sections 3.1

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.align = "center")

pkgs <- c("AmesHousing","rsample","dplyr","ggplot2","visdat",
          "caret","recipes","reshape2","forecast","tidyr","purrr","broom","scales")
install.packages(setdiff(pkgs, rownames(installed.packages())), repos = "https://cloud.r-project.org")

3.1 Prerequisites

library(dplyr); library(ggplot2); library(visdat)
library(caret); library(recipes)
library(purrr); library(tidyr); library(broom)
library(AmesHousing); library(rsample); library(scales)

ggplot2::theme_set(ggplot2::theme_light())

ames <- AmesHousing::make_ames()
set.seed(123)
split      <- rsample::initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test  <- rsample::testing(split)

3.2 Target Engineering

models <- c("Non-log transformed model residuals", "Log transformed model residuals")
list(
  m1 = lm(Sale_Price ~ Year_Built, data = ames_train),
  m2 = lm(log(Sale_Price) ~ Year_Built, data = ames_train)
) %>%
  map2_dfr(models, ~ broom::augment(.x) %>% mutate(model = .y)) %>%
  ggplot(aes(.resid)) +
  geom_histogram(bins = 75) +
  facet_wrap(~ model, scales = "free_x") +
  ylab(NULL) + xlab("Residuals")

ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(all_outcomes())
ames_recipe

log(-0.5)

## [1] NaN

log1p(-0.5)

## [1] -0.6931472

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_BoxCox(all_outcomes())

lambda     <- forecast::BoxCox.lambda(ames_train$Sale_Price)
train_bc_y <- forecast::BoxCox(ames_train$Sale_Price, lambda)

data.frame(
  Normal           = ames_train$Sale_Price,
  Log_Transform    = log(ames_train$Sale_Price),
  BoxCox_Transform = train_bc_y
) %>%
  gather(Transform, Value) %>%
  mutate(Transform = factor(Transform, levels = c("Normal","Log_Transform","BoxCox_Transform"))) %>%
  ggplot(aes(Value, fill = Transform)) +
  geom_histogram(show.legend = FALSE, bins = 40) +
  facet_wrap(~ Transform, scales = "free_x") +
  xlab("Value") + ylab("count")

y <- log(10)
exp(y)

## [1] 10

y <- forecast::BoxCox(10, lambda)
inv_box_cox <- function(x, lambda) {
  if (lambda == 0) exp(x) else (lambda * x + 1)^(1 / lambda)
}
inv_box_cox(y, lambda)

## [1] 10
## attr(,"lambda")
## [1] 0.138742

3.3 Dealing with Missingness

sum(is.na(AmesHousing::ames_raw))

## [1] 13997

# Figure 3.3 - horizontal (no coord_flip)
AmesHousing::ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var1, Var2, fill = value)) +
  geom_raster() +
  scale_x_continuous(NULL, expand = c(0, 0)) +
  scale_fill_grey(name = "", labels = c("Present", "Missing")) +
  ylab("Observation") +
  theme(axis.text.x = element_text(size = 4, angle = 90, hjust = 1))

# Figure 3.4 - wider plot so column names are fully visible
vis_miss(AmesHousing::ames_raw, cluster = TRUE) +
  theme(axis.text.x = element_text(size = 6, angle = 90, hjust = 1))

AmesHousing::ames_raw %>%
  filter(is.na(`Garage Type`)) %>%
  select(`Garage Type`, `Garage Cars`, `Garage Area`)

## # A tibble: 157 × 3
##    `Garage Type` `Garage Cars` `Garage Area`
##    <chr>                 <int>         <int>
##  1 <NA>                      0             0
##  2 <NA>                      0             0
##  3 <NA>                      0             0
##  4 <NA>                      0             0
##  5 <NA>                      0             0
##  6 <NA>                      0             0
##  7 <NA>                      0             0
##  8 <NA>                      0             0
##  9 <NA>                      0             0
## 10 <NA>                      0             0
## # ℹ 147 more rows

ames_recipe %>% step_impute_median(Gr_Liv_Area)
ames_recipe %>% step_impute_knn(all_predictors(), neighbors = 6)
ames_recipe %>% step_impute_bag(all_predictors())

# Figure 3.5 - fix grey background dots showing in all panels
set.seed(123)

sub_data <- ames_train %>%
  select(Gr_Liv_Area, Sale_Price, Lot_Area, Year_Built, Garage_Area) %>%
  mutate(across(everything(), as.numeric))

missing_idx <- sample(nrow(sub_data), 75)
actual_vals <- sub_data$Gr_Liv_Area[missing_idx]
sub_missing <- sub_data
sub_missing$Gr_Liv_Area[missing_idx] <- NA

# Mean imputation
imp_mean <- sub_missing
imp_mean$Gr_Liv_Area[missing_idx] <- mean(sub_data$Gr_Liv_Area)

# KNN imputation
imp_knn <- prep(
  recipe(Sale_Price ~ ., data = sub_missing) %>%
    step_impute_knn(Gr_Liv_Area, neighbors = 5),
  training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)

# Bagged tree imputation
imp_bag <- prep(
  recipe(Sale_Price ~ ., data = sub_missing) %>%
    step_impute_bag(Gr_Liv_Area),
  training = sub_missing
) %>% bake(new_data = sub_missing) %>% pull(Gr_Liv_Area)

# Build tidy plot data - background points in ALL panels
all_bg <- data.frame(
  Gr_Liv_Area = sub_data$Gr_Liv_Area,
  Sale_Price  = sub_data$Sale_Price,
  type        = "background"
)

make_panel <- function(imp_vec, label) {
  bind_rows(
    all_bg %>% mutate(Panel = label),
    data.frame(
      Gr_Liv_Area = imp_vec[missing_idx],
      Sale_Price  = sub_data$Sale_Price[missing_idx],
      type        = "imputed",
      Panel       = label
    ),
    data.frame(
      Gr_Liv_Area = actual_vals,
      Sale_Price  = sub_data$Sale_Price[missing_idx],
      type        = "actual",
      Panel       = label
    )
  )
}

plot_df <- bind_rows(
  # Actual values panel
  bind_rows(
    all_bg %>% mutate(
      type  = ifelse(seq_len(nrow(sub_data)) %in% missing_idx, "actual", "background"),
      Panel = "Actual values"
    )
  ),
  make_panel(imp_mean$Gr_Liv_Area, "Mean Imputation"),
  make_panel(imp_knn,              "KNN Imputation"),
  make_panel(imp_bag,              "Bagged Trees Imputation")
) %>%
  mutate(Panel = factor(Panel, levels = c("Actual values","Mean Imputation",
                                          "KNN Imputation","Bagged Trees Imputation")))

ggplot(plot_df, aes(x = Gr_Liv_Area, y = Sale_Price, colour = type, alpha = type, size = type)) +
  geom_point() +
  facet_wrap(~ Panel, ncol = 2) +
  scale_colour_manual(
    values = c(background = "grey60", actual = "#D62728", imputed = "#1F77B4"),
    labels = c("Background", "Actual (removed)", "Imputed"),
    name   = NULL
  ) +
  scale_alpha_manual(values = c(background = 0.3, actual = 0.9, imputed = 0.9), guide = "none") +
  scale_size_manual(values  = c(background = 1,   actual = 2,   imputed = 2),   guide = "none") +
  scale_x_log10() + scale_y_log10() +
  labs(x = "Gr_Liv_Area", y = "Sale_Price") +
  theme(legend.position = "bottom")

3.4 Feature Filtering

caret::nearZeroVar(ames_train, saveMetrics = TRUE) %>%
  tibble::rownames_to_column() %>%
  filter(nzv)

##               rowname  freqRatio percentUnique zeroVar  nzv
## 1              Street  226.66667    0.09760859   FALSE TRUE
## 2               Alley   24.25316    0.14641288   FALSE TRUE
## 3        Land_Contour   19.50000    0.19521718   FALSE TRUE
## 4           Utilities 1023.00000    0.14641288   FALSE TRUE
## 5          Land_Slope   22.15909    0.14641288   FALSE TRUE
## 6         Condition_2  202.60000    0.34163006   FALSE TRUE
## 7           Roof_Matl  144.35714    0.39043436   FALSE TRUE
## 8           Bsmt_Cond   20.24444    0.29282577   FALSE TRUE
## 9      BsmtFin_Type_2   25.85294    0.34163006   FALSE TRUE
## 10       BsmtFin_SF_2  453.25000    9.37042460   FALSE TRUE
## 11            Heating  106.00000    0.29282577   FALSE TRUE
## 12    Low_Qual_Fin_SF 1010.50000    1.31771596   FALSE TRUE
## 13      Kitchen_AbvGr   21.23913    0.19521718   FALSE TRUE
## 14         Functional   38.89796    0.39043436   FALSE TRUE
## 15     Enclosed_Porch  102.05882    7.41825281   FALSE TRUE
## 16 Three_season_porch  673.66667    1.12249878   FALSE TRUE
## 17       Screen_Porch  169.90909    4.63640800   FALSE TRUE
## 18          Pool_Area 2039.00000    0.53684724   FALSE TRUE
## 19            Pool_QC  509.75000    0.24402147   FALSE TRUE
## 20       Misc_Feature   34.18966    0.24402147   FALSE TRUE
## 21           Misc_Val  180.54545    1.56173743   FALSE TRUE

ames_recipe %>% step_nzv(all_predictors())

# Figure 3.6 - RMSE profiles (reconstructed from book values)
n_steps <- c(0, 100, 200, 300, 400, 500)

rmse_data <- bind_rows(
  # Linear models
  data.frame(n=n_steps, RMSE=c(0.68,0.75,0.82,0.92,1.10,1.30), model="Elastic net",              Panel="Linear models"),
  data.frame(n=n_steps, RMSE=c(0.67,0.70,0.72,0.73,0.73,0.74), model="Lasso",                    Panel="Linear models"),
  data.frame(n=n_steps, RMSE=c(0.68,0.75,0.82,0.92,1.10,1.30), model="Linear regression",        Panel="Linear models"),
  data.frame(n=n_steps, RMSE=c(0.72,0.82,0.93,1.13,1.38,1.52), model="Partial least squares",    Panel="Linear models"),
  data.frame(n=n_steps, RMSE=c(0.73,0.82,0.88,0.92,0.96,0.99), model="Principal component regression", Panel="Linear models"),
  # Non-linear models
  data.frame(n=n_steps, RMSE=c(0.68,0.68,0.68,0.68,0.68,0.68), model="Multivariate adaptive regression splines", Panel="Non-linear Models"),
  data.frame(n=n_steps, RMSE=c(0.89,0.93,0.93,0.94,0.93,0.95), model="Neural net",               Panel="Non-linear Models"),
  data.frame(n=n_steps, RMSE=c(0.74,0.88,1.20,1.22,0.97,1.58), model="Support vector machine",   Panel="Non-linear Models"),
  # Tree-based models
  data.frame(n=n_steps, RMSE=c(0.67,0.68,0.69,0.70,0.72,0.74), model="Gradient boosting machines", Panel="Tree-based Models"),
  data.frame(n=n_steps, RMSE=c(0.69,0.71,0.73,0.75,0.75,0.75), model="Random forest",            Panel="Tree-based Models")
) %>%
  mutate(Panel = factor(Panel, levels = c("Linear models","Non-linear Models","Tree-based Models")))

ggplot(rmse_data, aes(x = n, y = RMSE, color = model, linetype = model)) +
  geom_line(size = 0.8) +
  geom_point(size = 2) +
  facet_wrap(~ Panel) +
  labs(
    x     = "Number of additional non-informative predictors",
    y     = "RMSE",
    color = "model", linetype = "model"
  ) +
  theme(legend.position = "right",
        legend.text     = element_text(size = 8),
        legend.key.width = unit(1.5, "cm"))

# Figure 3.7 - Training time increase (reconstructed from book values)
time_data <- bind_rows(
  # Linear models
  data.frame(n=n_steps, pct=c(0,70,115,180,430,630),  model="Elastic net",              Panel="Linear models"),
  data.frame(n=n_steps, pct=c(0,25, 45, 85,185,285),  model="Lasso",                    Panel="Linear models"),
  data.frame(n=n_steps, pct=c(0,10, 40,160,270,280),  model="Linear regression",        Panel="Linear models"),
  data.frame(n=n_steps, pct=c(0,60,120,220,450,810),  model="Partial least squares",    Panel="Linear models"),
  data.frame(n=n_steps, pct=c(0,55,110,200,320,340),  model="Principal component regression", Panel="Linear models"),
  # Non-linear models
  data.frame(n=n_steps, pct=c(0, 5, 15, 30, 55, 80),  model="Multivariate adaptive regression splines", Panel="Non-linear Models"),
  data.frame(n=n_steps, pct=c(0,60,130,280,490,840),  model="Neural net",               Panel="Non-linear Models"),
  data.frame(n=n_steps, pct=c(0,95,230,465,660,840),  model="Support vector machine",   Panel="Non-linear Models"),
  # Tree-based models
  data.frame(n=n_steps, pct=c(0,15, 55, 90,135,240),  model="Gradient boosting machines", Panel="Tree-based Models"),
  data.frame(n=n_steps, pct=c(0, 5, 20, 55, 90,150),  model="Random forest",            Panel="Tree-based Models")
) %>%
  mutate(Panel = factor(Panel, levels = c("Linear models","Non-linear Models","Tree-based Models")))

ggplot(time_data, aes(x = n, y = pct, color = model, linetype = model)) +
  geom_line(size = 0.8) +
  geom_point(size = 2) +
  facet_wrap(~ Panel) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  labs(
    x     = "Number of additional non-informative predictors",
    y     = "Percent increase in training time",
    color = "model", linetype = "model"
  ) +
  theme(legend.position = "right",
        legend.text     = element_text(size = 8),
        legend.key.width = unit(1.5, "cm"))

3.5 Numeric Feature Engineering

recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric())

ames_recipe %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(),  -all_outcomes())

# Figure 3.8 - match original: x1 near 0, x2 spread wide, x3 near 140-160
set.seed(7)
n <- 50

feat_df <- data.frame(
  x1 = rnorm(n,  0,  1),
  x2 = c(rnorm(40, 20, 15), runif(10, 200, 250)),   # spread with outliers like original
  x3 = rnorm(n, 145, 8)
)

std_df <- as.data.frame(scale(feat_df))

bind_rows(
  pivot_longer(feat_df, everything(), names_to = "Feature", values_to = "Value") %>%
    mutate(Scale = "Real value"),
  pivot_longer(std_df,  everything(), names_to = "Feature", values_to = "Value") %>%
    mutate(Scale = "Standardized value")
) %>%
  mutate(Scale = factor(Scale, levels = c("Real value", "Standardized value"))) %>%
  ggplot(aes(x = Value, y = Feature)) +
  geom_jitter(height = 0.07, alpha = 0.6, size = 1.8) +
  facet_wrap(~ Scale, scales = "free_x") +
  labs(x = "Value", y = "Feature")

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0      rsample_1.3.1     AmesHousing_0.0.4 broom_1.0.10     
##  [5] tidyr_1.3.1       purrr_1.1.0       recipes_1.3.1     caret_7.0-1      
##  [9] lattice_0.22-7    visdat_0.6.0      ggplot2_4.0.0     dplyr_1.1.4      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4051.111    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.6          sass_0.4.10         
## [16] tools_4.5.1          utf8_1.2.6           yaml_2.3.10         
## [19] data.table_1.17.8    knitr_1.50           labeling_0.4.3      
## [22] curl_7.0.0           TTR_0.24.4           plyr_1.8.9          
## [25] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [28] grid_4.5.1           stats4_4.5.1         xts_0.14.1          
## [31] colorspace_2.1-2     future_1.67.0        globals_0.18.0      
## [34] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [37] rmarkdown_2.30       generics_0.1.4       rstudioapi_0.17.1   
## [40] future.apply_1.20.0  reshape2_1.4.5       cachem_1.1.0        
## [43] stringr_1.5.2        splines_4.5.1        forecast_8.24.0     
## [46] parallel_4.5.1       urca_1.3-4           vctrs_0.6.5         
## [49] hardhat_1.4.2        Matrix_1.7-3         jsonlite_2.0.0      
## [52] tseries_0.10-58      listenv_0.9.1        foreach_1.5.2       
## [55] gower_1.0.2          jquerylib_0.1.4      quantmod_0.4.28     
## [58] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [61] lubridate_1.9.4      stringi_1.8.7        gtable_0.3.6        
## [64] quadprog_1.5-8       lmtest_0.9-40        tibble_3.3.0        
## [67] pillar_1.11.1        furrr_0.3.1          htmltools_0.5.8.1   
## [70] ipred_0.9-15         lava_1.8.1           R6_2.6.1            
## [73] evaluate_1.0.5       backports_1.5.0      fracdiff_1.5-3      
## [76] bslib_0.9.0          class_7.3-23         Rcpp_1.1.0          
## [79] nlme_3.1-168         prodlim_2025.04.28   xfun_0.53           
## [82] zoo_1.8-14           ModelMetrics_1.2.2.2 pkgconfig_2.0.3

Chapter 3: Feature & Target Engineering (Sections 3.1–3.5)

Khongorzul Erkhembayar

2026-03-09

3.1 Prerequisites

3.2 Target Engineering

3.3 Dealing with Missingness

3.4 Feature Filtering

3.5 Numeric Feature Engineering