Chapter 3: Feature & Target Engineering

“Live with your data before you plunge into modeling.”
— Leo Breiman

This report reproduces the outputs from Sections 3.1 – 3.5 of
Hands-On Machine Learning with R.


3.1 Prerequisites

library(ggplot2)
library(dplyr)
library(recipes)
library(rsample)
library(modeldata)
library(e1071)
library(tidyr)
library(gridExtra)
library(scales)

select <- dplyr::select

# Load Ames Housing dataset
data(ames)
cat("Dataset dimensions:", nrow(ames), "rows x", ncol(ames), "columns\n")
## Dataset dimensions: 2930 rows x 74 columns
# 70/30 stratified train/test split
set.seed(123)
split      <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test  <- testing(split)

cat("Training set:", nrow(ames_train), "rows\n")
## Training set: 2049 rows
cat("Test set:    ", nrow(ames_test),  "rows\n")
## Test set:     881 rows
summary(ames_train$Sale_Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12789  129500  160000  180923  213500  755000

3.2 Target Engineering

Skewed target variables can hurt model performance. We compare three approaches: original, log(), and log10() via recipes.

skew_orig  <- round(skewness(ames_train$Sale_Price), 3)
skew_log   <- round(skewness(log(ames_train$Sale_Price)), 3)

rec_log    <- recipe(Sale_Price ~ ., data = ames_train) %>%
                step_log(Sale_Price, base = 10)
ames_log   <- bake(prep(rec_log, ames_train), new_data = ames_train)
skew_log10 <- round(skewness(ames_log$Sale_Price), 3)

cat("Skewness — Original :", skew_orig,  "\n")
## Skewness — Original : 1.671
cat("Skewness — log()    :", skew_log,   "\n")
## Skewness — log()    : -0.096
cat("Skewness — log10()  :", skew_log10, "\n")
## Skewness — log10()  : -0.096
p1 <- ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white") +
  scale_x_continuous(labels = dollar_format()) +
  labs(title = paste0("(a) Original\nskewness = ", skew_orig),
       x = "Sale Price", y = "Count") +
  theme_bw()

p2 <- ggplot(ames_train, aes(x = log(Sale_Price))) +
  geom_histogram(bins = 50, fill = "salmon", color = "white") +
  labs(title = paste0("(b) log() Transformed\nskewness = ", skew_log),
       x = "log(Sale Price)", y = "Count") +
  theme_bw()

p3 <- ggplot(ames_log, aes(x = Sale_Price)) +
  geom_histogram(bins = 50, fill = "mediumseagreen", color = "white") +
  labs(title = paste0("(c) log10 via recipes\nskewness = ", skew_log10),
       x = "log10(Sale Price)", y = "Count") +
  theme_bw()

grid.arrange(p1, p2, p3, ncol = 3)

Result: Log transformation reduces skewness from 1.671 to -0.096, making the target distribution much more symmetric.


3.3 Dealing with Missingness

3.3.1 Visualizing Missing Values

# modeldata::ames has no NAs — introducing artificial missingness for demonstration
set.seed(42)
ames_miss <- ames_train
ames_miss$Lot_Frontage[sample(nrow(ames_miss), 330)] <- NA
ames_miss$Garage_Cars[sample(nrow(ames_miss), 150)]  <- NA
ames_miss$Mas_Vnr_Area[sample(nrow(ames_miss), 80)]  <- NA
missing_count <- colSums(is.na(ames_miss))
missing_df <- data.frame(
  feature     = names(missing_count),
  n_missing   = as.integer(missing_count),
  pct_missing = round(missing_count / nrow(ames_miss) * 100, 1)
) %>%
  filter(n_missing > 0) %>%
  arrange(desc(pct_missing))

missing_df
##                   feature n_missing pct_missing
## Lot_Frontage Lot_Frontage       330        16.1
## Garage_Cars   Garage_Cars       150         7.3
## Mas_Vnr_Area Mas_Vnr_Area        80         3.9
ggplot(missing_df, aes(x = reorder(feature, pct_missing), y = pct_missing)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  labs(title = "Missing Values per Feature",
       x = "Feature", y = "% Missing") +
  theme_bw()

samp     <- ames_miss[sample(nrow(ames_miss), 300), missing_df$feature]
miss_mat <- as.data.frame(is.na(samp)) %>%
  mutate(row_id = row_number()) %>%
  pivot_longer(-row_id, names_to = "feature", values_to = "missing")

ggplot(miss_mat, aes(x = row_id, y = feature, fill = missing)) +
  geom_tile() +
  scale_fill_manual(values = c("FALSE" = "#d4e6f1", "TRUE" = "#e74c3c"),
                    labels = c("Observed", "Missing")) +
  labs(title = "Missingness Pattern (300 row sample)",
       x = "Row (sample)", y = "Feature", fill = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

3.3.2 Imputation

We compare three imputation strategies on Lot_Frontage:

# Mean imputation
rec_mean      <- recipe(Sale_Price ~ ., data = ames_miss) %>%
                   step_impute_mean(Lot_Frontage, Garage_Cars, Mas_Vnr_Area)
ames_mean_imp <- bake(prep(rec_mean, ames_miss), new_data = ames_miss)

# KNN imputation
rec_knn       <- recipe(Sale_Price ~ ., data = ames_miss) %>%
                   step_impute_knn(Lot_Frontage, Garage_Cars, Mas_Vnr_Area, neighbors = 6)
ames_knn_imp  <- bake(prep(rec_knn, ames_miss), new_data = ames_miss)

# Bagged tree imputation
rec_bag       <- recipe(Sale_Price ~ ., data = ames_miss) %>%
                   step_impute_bag(Lot_Frontage, Garage_Cars, Mas_Vnr_Area)
ames_bag_imp  <- bake(prep(rec_bag, ames_miss), new_data = ames_miss)

cat("NAs remaining after mean imputation:   ", sum(is.na(ames_mean_imp$Lot_Frontage)), "\n")
## NAs remaining after mean imputation:    0
cat("NAs remaining after KNN imputation:    ", sum(is.na(ames_knn_imp$Lot_Frontage)),  "\n")
## NAs remaining after KNN imputation:     0
cat("NAs remaining after bagged imputation: ", sum(is.na(ames_bag_imp$Lot_Frontage)),  "\n")
## NAs remaining after bagged imputation:  0
imp_df <- bind_rows(
  data.frame(value  = ames_miss$Lot_Frontage[!is.na(ames_miss$Lot_Frontage)],
             method = "Original (no NA)"),
  data.frame(value  = ames_mean_imp$Lot_Frontage, method = "Mean Imputed"),
  data.frame(value  = ames_knn_imp$Lot_Frontage,  method = "KNN Imputed"),
  data.frame(value  = ames_bag_imp$Lot_Frontage,  method = "Bagged Tree Imputed")
) %>%
  mutate(method = factor(method,
    levels = c("Original (no NA)", "Mean Imputed", "KNN Imputed", "Bagged Tree Imputed")))

ggplot(imp_df, aes(x = value, fill = method)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85) +
  facet_wrap(~method, ncol = 2) +
  labs(title = "Imputation Methods Comparison: Lot_Frontage",
       x = "Lot Frontage", y = "Count") +
  theme_bw() +
  theme(legend.position = "none")

Note: Mean imputation creates a spike at the mean value (visible in the histogram), while KNN and Bagged Tree methods preserve the original distribution shape better.


3.4 Feature Filtering

Near-zero variance (NZV) features add noise and increase computation. We use step_nzv().

rec_nzv  <- recipe(Sale_Price ~ ., data = ames_train) %>%
              step_nzv(all_predictors())
ames_nzv <- bake(prep(rec_nzv, ames_train), new_data = ames_train)

removed_nzv <- setdiff(names(ames_train), names(ames_nzv))

cat("Features before NZV filter:", ncol(ames_train) - 1, "\n")
## Features before NZV filter: 73
cat("Features after  NZV filter:", ncol(ames_nzv)  - 1, "\n")
## Features after  NZV filter: 53
cat("Removed features:\n")
## Removed features:
print(removed_nzv)
##  [1] "Street"             "Alley"              "Land_Contour"      
##  [4] "Utilities"          "Land_Slope"         "Condition_2"       
##  [7] "Roof_Matl"          "Bsmt_Cond"          "BsmtFin_Type_2"    
## [10] "BsmtFin_SF_2"       "Heating"            "Kitchen_AbvGr"     
## [13] "Functional"         "Enclosed_Porch"     "Three_season_porch"
## [16] "Screen_Porch"       "Pool_Area"          "Pool_QC"           
## [19] "Misc_Feature"       "Misc_Val"
factor_cols <- names(ames_train)[sapply(ames_train, is.factor)]
freq_ratios <- sapply(factor_cols, function(col) {
  tbl <- sort(table(ames_train[[col]]), decreasing = TRUE)
  if (length(tbl) < 2) return(Inf)
  as.numeric(tbl[1]) / as.numeric(tbl[2])
})

fr_df <- data.frame(feature = names(freq_ratios), freq_ratio = freq_ratios) %>%
  arrange(desc(freq_ratio)) %>%
  head(20)

ggplot(fr_df, aes(x = reorder(feature, freq_ratio), y = freq_ratio)) +
  geom_col(fill = "steelblue") +
  geom_hline(yintercept = 19, color = "red", linetype = "dashed", linewidth = 1) +
  coord_flip() +
  labs(title = "Near-Zero Variance: Frequency Ratio per Feature\n(Red dashed = default NZV threshold = 19)",
       x = "Feature", y = "Frequency Ratio") +
  theme_bw()

Result: 20 features were removed as near-zero variance predictors, reducing the feature space from 73 to 53.


3.5 Numeric Feature Engineering

3.5.1 Skewness

Many numeric predictors are right-skewed. We apply log and Box-Cox transformations.

num_cols  <- names(ames_train)[sapply(ames_train, is.numeric)]
num_cols  <- setdiff(num_cols, "Sale_Price")
skew_vals <- sapply(num_cols, function(col) skewness(ames_train[[col]], na.rm = TRUE))
skew_df   <- data.frame(feature = num_cols, skewness = round(skew_vals, 3)) %>%
               arrange(desc(abs(skewness)))

cat("Top 10 most skewed numeric features:\n")
## Top 10 most skewed numeric features:
head(skew_df, 10)
##                               feature skewness
## Misc_Val                     Misc_Val   20.724
## Pool_Area                   Pool_Area   16.664
## Lot_Area                     Lot_Area   12.976
## Three_season_porch Three_season_porch   11.268
## Kitchen_AbvGr           Kitchen_AbvGr    4.448
## Enclosed_Porch         Enclosed_Porch    4.400
## BsmtFin_SF_2             BsmtFin_SF_2    4.157
## Bsmt_Half_Bath         Bsmt_Half_Bath    4.002
## Screen_Porch             Screen_Porch    3.754
## Open_Porch_SF           Open_Porch_SF    2.713
orig_skew <- round(skewness(ames_train$Lot_Area), 2)
log_skew  <- round(skewness(log(ames_train$Lot_Area)), 2)

rec_bc  <- recipe(Sale_Price ~ ., data = ames_train) %>%
             step_impute_median(all_numeric_predictors()) %>%
             step_BoxCox(Lot_Area)
ames_bc <- bake(prep(rec_bc, ames_train), new_data = ames_train)
bc_skew <- round(skewness(ames_bc$Lot_Area), 2)

p_s1 <- ggplot(ames_train, aes(x = Lot_Area)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "white") +
  labs(title = paste0("(a) Original\nskewness = ", orig_skew),
       x = "Lot Area", y = "Count") + theme_bw()

p_s2 <- ggplot(ames_train, aes(x = log(Lot_Area))) +
  geom_histogram(bins = 40, fill = "salmon", color = "white") +
  labs(title = paste0("(b) Log Transformed\nskewness = ", log_skew),
       x = "log(Lot Area)", y = "Count") + theme_bw()

p_s3 <- ggplot(ames_bc, aes(x = Lot_Area)) +
  geom_histogram(bins = 40, fill = "mediumseagreen", color = "white") +
  labs(title = paste0("(c) Box-Cox Transformed\nskewness = ", bc_skew),
       x = "Box-Cox(Lot Area)", y = "Count") + theme_bw()

grid.arrange(p_s1, p_s2, p_s3, ncol = 3)

Result: Box-Cox reduces Lot_Area skewness from 12.980.09, nearly achieving a normal distribution.

3.5.2 Standardization

Features on different scales can bias distance-based algorithms. We use step_center() + step_scale().

demo_feats <- c("Year_Built", "Gr_Liv_Area", "Lot_Area")

cat("--- Before standardization ---\n")
## --- Before standardization ---
ames_train %>%
  select(all_of(demo_feats)) %>%
  summarise(across(everything(),
    list(mean = ~round(mean(.x, na.rm=TRUE), 1),
         sd   = ~round(sd(.x,   na.rm=TRUE), 1))))
## # A tibble: 1 × 6
##   Year_Built_mean Year_Built_sd Gr_Liv_Area_mean Gr_Liv_Area_sd Lot_Area_mean
##             <dbl>         <dbl>            <dbl>          <dbl>         <dbl>
## 1           1971.          30.2            1504.           518.        10163.
## # ℹ 1 more variable: Lot_Area_sd <dbl>
rec_std  <- recipe(Sale_Price ~ ., data = ames_train) %>%
              step_impute_median(all_numeric_predictors()) %>%
              step_center(all_numeric_predictors()) %>%
              step_scale(all_numeric_predictors())
ames_std <- bake(prep(rec_std, ames_train), new_data = ames_train)

cat("--- After standardization (mean ~ 0, SD ~ 1) ---\n")
## --- After standardization (mean ~ 0, SD ~ 1) ---
ames_std %>%
  select(all_of(demo_feats)) %>%
  summarise(across(everything(),
    list(mean = ~round(mean(.x, na.rm=TRUE), 4),
         sd   = ~round(sd(.x,   na.rm=TRUE), 4))))
## # A tibble: 1 × 6
##   Year_Built_mean Year_Built_sd Gr_Liv_Area_mean Gr_Liv_Area_sd Lot_Area_mean
##             <dbl>         <dbl>            <dbl>          <dbl>         <dbl>
## 1               0             1                0              1             0
## # ℹ 1 more variable: Lot_Area_sd <dbl>
before_long <- ames_train %>%
  select(all_of(demo_feats)) %>%
  pivot_longer(everything(), names_to = "feature", values_to = "value") %>%
  mutate(type = "Original")

after_long <- ames_std %>%
  select(all_of(demo_feats)) %>%
  pivot_longer(everything(), names_to = "feature", values_to = "value") %>%
  mutate(type = "Standardized")

both_long <- bind_rows(before_long, after_long) %>%
  mutate(type = factor(type, levels = c("Original", "Standardized")))

ggplot(both_long, aes(x = value, fill = type)) +
  geom_histogram(bins = 35, color = "white", alpha = 0.85) +
  facet_grid(type ~ feature, scales = "free") +
  scale_fill_manual(values = c("Original" = "steelblue",
                                "Standardized" = "darkorange")) +
  labs(title = "Standardization: Before vs After",
       x = "Value", y = "Count") +
  theme_bw() +
  theme(legend.position = "none", strip.text = element_text(size = 9))

Result: After standardization, all features have mean = 0 and SD = 1, making them comparable regardless of original scale.


Summary

Section Topic Key Technique
3.1 Prerequisites rsample::initial_split()
3.2 Target Engineering step_log(), skewness reduction
3.3 Missingness step_impute_mean/knn/bag()
3.4 Feature Filtering step_nzv() — removed 20 features
3.5 Numeric Engineering step_BoxCox(), step_center(), step_scale()

Rendered with R 4.4.1 · Source: HOML Chapter 3