Chapter 3: Feature & Target Engineering (Sections 3.1–3.5)

Reproducing outputs from Sections 3.1 to 3.5 of Hands-On Machine Learning with R.


3.1 Prerequisites

# Helper packages
library(dplyr)    # for data manipulation
library(ggplot2)  # for awesome graphics
library(visdat)   # for additional visualizations

# Feature engineering packages
library(caret)    # for various ML tasks
library(recipes)  # for feature engineering tasks

# Additional packages
library(purrr)
library(broom)

# Set plotting theme
ggplot2::theme_set(ggplot2::theme_light())

# Load Ames housing data
ames <- AmesHousing::make_ames()

# Stratified 70/30 split on Sale_Price
set.seed(123)
split      <- rsample::initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test  <- rsample::testing(split)

dim(ames_train)
## [1] 2049   81
dim(ames_test)
## [1] 881  81
head(ames_train$Sale_Price)
## [1] 105500  88000 120000 125000  67500 112000

3.2 Target Engineering

Figure 3.1 – Comparing residuals from a non-log-transformed vs. a log-transformed model:

models <- c("Non-log transformed model residuals",
            "Log transformed model residuals")

list(
  m1 = lm(Sale_Price ~ Year_Built, data = ames_train),
  m2 = lm(log(Sale_Price) ~ Year_Built, data = ames_train)
) %>%
  map2_dfr(models, ~ broom::augment(.x) %>% mutate(model = .y)) %>%
  ggplot(aes(.resid)) +
    geom_histogram(bins = 75) +
    facet_wrap(~ model, scales = "free_x") +
    ylab(NULL) +
    xlab("Residuals")
Figure 3.1: Residuals – non-log vs. log-transformed Sale_Price

Figure 3.1: Residuals – non-log vs. log-transformed Sale_Price

Applying a log transformation to the response via recipes:

# Direct log transform
transformed_response <- log(ames_train$Sale_Price)
head(transformed_response)
## [1] 11.56647 11.38509 11.69525 11.73607 11.11988 11.62625
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(all_outcomes())
ames_recipe

Demonstrating that log() of a negative number produces NaN:

log(-0.5)
## [1] NaN

Box-Cox transformation via recipes:

ames_recipe_bc <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_BoxCox(all_outcomes())
ames_recipe_bc

3.3 Dealing with Missingness

3.3.1 Visualizing Missing Values

ames_raw <- AmesHousing::ames_raw

ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
    geom_raster() +
    coord_flip() +
    scale_y_continuous(NULL, expand = c(0, 0)) +
    scale_fill_grey(name = "",
                    labels = c("Present", "Missing")) +
    xlab("Observation") +
    theme(axis.text.y = element_text(size = 4))
Figure 3.2: Missing value map for raw Ames data

Figure 3.2: Missing value map for raw Ames data

visdat::vis_miss(ames_raw, cluster = TRUE)
Figure 3.3: vis_miss cluster plot

Figure 3.3: vis_miss cluster plot

3.3.2 Imputation

3.3.2.1 Estimated Statistic (Median Imputation)

ames_recipe_med <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_median(Gr_Liv_Area)
ames_recipe_med

3.3.2.2 K-Nearest Neighbor Imputation

ames_recipe_knn <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_knn(all_predictors(), neighbors = 6)
ames_recipe_knn

3.3.2.3 Tree-Based (Bagged Trees) Imputation

ames_recipe_bag <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_bag(all_predictors())
ames_recipe_bag

3.4 Feature Filtering

# Identify near-zero variance features with caret
nzv_info <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)
cat("Features flagged as near-zero variance:\n")
## Features flagged as near-zero variance:
print(nzv_info[nzv_info$nzv == TRUE, ])
##                     freqRatio percentUnique zeroVar  nzv
## Street              226.66667    0.09760859   FALSE TRUE
## Alley                24.25316    0.14641288   FALSE TRUE
## Land_Contour         19.50000    0.19521718   FALSE TRUE
## Utilities          1023.00000    0.14641288   FALSE TRUE
## Land_Slope           22.15909    0.14641288   FALSE TRUE
## Condition_2         202.60000    0.34163006   FALSE TRUE
## Roof_Matl           144.35714    0.39043436   FALSE TRUE
## Bsmt_Cond            20.24444    0.29282577   FALSE TRUE
## BsmtFin_Type_2       25.85294    0.34163006   FALSE TRUE
## BsmtFin_SF_2        453.25000    9.37042460   FALSE TRUE
## Heating             106.00000    0.29282577   FALSE TRUE
## Low_Qual_Fin_SF    1010.50000    1.31771596   FALSE TRUE
## Kitchen_AbvGr        21.23913    0.19521718   FALSE TRUE
## Functional           38.89796    0.39043436   FALSE TRUE
## Enclosed_Porch      102.05882    7.41825281   FALSE TRUE
## Three_season_porch  673.66667    1.12249878   FALSE TRUE
## Screen_Porch        169.90909    4.63640800   FALSE TRUE
## Pool_Area          2039.00000    0.53684724   FALSE TRUE
## Pool_QC             509.75000    0.24402147   FALSE TRUE
## Misc_Feature         34.18966    0.24402147   FALSE TRUE
## Misc_Val            180.54545    1.56173743   FALSE TRUE
# Remove near-zero variance predictors via recipes
ames_recipe_nzv <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors())
ames_recipe_nzv

3.5 Numeric Feature Engineering

3.5.1 Skewness

# Compute skewness for all numeric features
skewed_features <- ames_train %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(),
                   ~ e1071::skewness(., na.rm = TRUE))) %>%
  tidyr::pivot_longer(everything(),
                      names_to  = "feature",
                      values_to = "skewness") %>%
  arrange(desc(abs(skewness)))

cat("Top 10 most skewed features:\n")
## Top 10 most skewed features:
print(head(skewed_features, 10))
## # A tibble: 10 × 2
##    feature            skewness
##    <chr>                 <dbl>
##  1 Misc_Val              20.7 
##  2 Pool_Area             16.7 
##  3 Lot_Area              13.0 
##  4 Three_season_porch    11.3 
##  5 Low_Qual_Fin_SF       10.6 
##  6 Kitchen_AbvGr          4.45
##  7 Enclosed_Porch         4.40
##  8 BsmtFin_SF_2           4.16
##  9 Bsmt_Half_Bath         4.00
## 10 Screen_Porch           3.75
p1 <- ggplot(ames_train, aes(Gr_Liv_Area)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white") +
  ggtitle("Original: Gr_Liv_Area") +
  xlab("Above grade living area (sq ft)")

p2 <- ggplot(ames_train, aes(log(Gr_Liv_Area))) +
  geom_histogram(bins = 50, fill = "darkorange", color = "white") +
  ggtitle("Log-transformed: Gr_Liv_Area") +
  xlab("log(Gr_Liv_Area)")

gridExtra::grid.arrange(p1, p2, ncol = 2)
Figure 3.4: Gr_Liv_Area before and after log transformation

Figure 3.4: Gr_Liv_Area before and after log transformation

Apply Yeo-Johnson transformation to all numeric predictors:

ames_recipe_yj <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric_predictors())
ames_recipe_yj

3.5.2 Standardization

p3 <- ggplot(ames_train, aes(Year_Built)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "white") +
  ggtitle("Before Standardization: Year_Built")

# Build, prep, and bake the standardization recipe
ames_recipe_std <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors())

ames_std_prep  <- prep(ames_recipe_std, training = ames_train)
ames_std_baked <- bake(ames_std_prep, new_data = ames_train)

p4 <- ggplot(ames_std_baked, aes(Year_Built)) +
  geom_histogram(bins = 40, fill = "darkorange", color = "white") +
  ggtitle("After Standardization: Year_Built")

gridExtra::grid.arrange(p3, p4, ncol = 2)
Figure 3.5: Year_Built before and after standardization

Figure 3.5: Year_Built before and after standardization

cat("Summary of Year_Built after standardization:\n")
## Summary of Year_Built after standardization:
summary(ames_std_baked$Year_Built)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.29356 -0.61093  0.05145  0.00000  0.97878  1.27685

End of Homework 2 – Sections 3.1 to 3.5 of HOML Chapter 3.