Overview

This document reproduces sections 3.1 to 3.5 of the online book Hands-On Machine Learning with R by Boehmke & Greenwell. Book link

3.1 Prerequisites

We use the Ames Housing dataset — residential property sales with 80+ features. We load it via the AmesHousing package and split into training/test sets using rsample.

# Install packages if needed (only runs once)
pkgs <- c("AmesHousing", "rsample", "recipes", "dplyr", "ggplot2",
          "visdat", "naniar", "caret")
new_pkgs <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if(length(new_pkgs)) install.packages(new_pkgs, repos = "https://cran.rstudio.com/")

# Load libraries
library(AmesHousing)
library(rsample)
library(recipes)
library(dplyr)
library(ggplot2)
library(visdat)
library(naniar)

# Load the Ames Housing dataset
ames <- make_ames()

# Split into 70% training, 30% testing
set.seed(123)
split  <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test  <- testing(split)

# Preview
cat("Training set:", nrow(ames_train), "rows x", ncol(ames_train), "columns\n")

## Training set: 2049 rows x 81 columns

cat("Test set:    ", nrow(ames_test),  "rows x", ncol(ames_test),  "columns\n")

## Test set:     881 rows x 81 columns

head(ames_train[, 1:6])

## # A tibble: 6 × 6
##   MS_SubClass                  MS_Zoning      Lot_Frontage Lot_Area Street Alley
##   <fct>                        <fct>                 <dbl>    <int> <fct>  <fct>
## 1 Two_Story_PUD_1946_and_Newer Residential_M…           21     1680 Pave   No_A…
## 2 Two_Story_PUD_1946_and_Newer Residential_M…           21     1680 Pave   No_A…
## 3 One_Story_PUD_1946_and_Newer Residential_L…           24     2280 Pave   No_A…
## 4 One_Story_PUD_1946_and_Newer Residential_L…           50     7175 Pave   No_A…
## 5 One_Story_1945_and_Older     Residential_H…           70     9800 Pave   No_A…
## 6 Duplex_All_Styles_and_Ages   Residential_M…           68     8930 Pave   No_A…

3.2 Target Engineering

Sale_Price is right-skewed. We compare three options: raw, log, and Box-Cox.

library(MASS)  # for boxcox

# Check skewness of original
cat("Skewness (original):", round(e1071::skewness(ames_train$Sale_Price), 3), "\n")

## Skewness (original): 1.671

# Option 1: Log transform
ames_train$log_price <- log(ames_train$Sale_Price)
cat("Skewness (log):     ", round(e1071::skewness(ames_train$log_price), 3), "\n")

## Skewness (log):      -0.096

# Option 2: Box-Cox (find optimal lambda)
bc       <- boxcox(Sale_Price ~ 1, data = ames_train, lambda = seq(-2, 2, 0.1))

lam_opt  <- bc$x[which.max(bc$y)]
cat("Box-Cox optimal lambda:", round(lam_opt, 4), "\n")

## Box-Cox optimal lambda: 0.0606

# Plot all three distributions
par(mfrow = c(1, 3), bg = "white")
hist(ames_train$Sale_Price,     main = "Original Sale_Price",  col = "#6eb5ff", border = "white", xlab = "")
hist(ames_train$log_price,      main = "log(Sale_Price)",       col = "#7fffd4", border = "white", xlab = "")
hist((ames_train$Sale_Price^lam_opt - 1)/lam_opt,
     main = paste0("Box-Cox (λ=", round(lam_opt,2), ")"),
     col = "#e8c97a", border = "white", xlab = "")

par(mfrow = c(1,1))

Interpretation: The log transform reduces skewness dramatically. Box-Cox finds λ ≈ 0, confirming log is the right choice here.

3.3 Dealing with Missingness

3.3.1 Visualising Missing Values

# See which columns have missing data
vis_miss(ames_train, warn_large_data = FALSE) +
  labs(title = "3.3.1 Missing Value Map — Ames Training Set") +
  theme(axis.text.x = element_text(angle = 90, size = 6))

# Top 10 columns by % missing
miss_summary <- ames_train %>%
  summarise(across(everything(), ~mean(is.na(.))*100)) %>%
  tidyr::pivot_longer(everything(), names_to="Variable", values_to="Pct_Missing") %>%
  filter(Pct_Missing > 0) %>%
  arrange(desc(Pct_Missing))

print(miss_summary)

## # A tibble: 0 × 2
## # ℹ 2 variables: Variable <chr>, Pct_Missing <dbl>

3.3.2 Imputation

# Build a recipe with multiple imputation strategies
impute_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  # Estimated statistic: median for numeric
  step_impute_median(Lot_Frontage) %>%
  # KNN imputation for other numerics
  step_impute_knn(all_numeric_predictors(), neighbors = 5) %>%
  # Mode for categoricals
  step_impute_mode(all_nominal_predictors())

# Apply the recipe
imputed <- prep(impute_recipe, training = ames_train)
ames_imputed <- bake(imputed, new_data = ames_train)

cat("Missing values BEFORE imputation:", sum(is.na(ames_train)), "\n")

## Missing values BEFORE imputation: 0

cat("Missing values AFTER imputation: ", sum(is.na(ames_imputed)), "\n")

## Missing values AFTER imputation:  0

3.4 Feature Filtering — Near-Zero Variance

library(caret)

# Identify near-zero variance features
nzv_cols <- nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_flag  <- nzv_cols[nzv_cols$nzv == TRUE, ]

cat("Near-zero variance features detected:", nrow(nzv_flag), "\n")

## Near-zero variance features detected: 21

print(head(nzv_flag[order(-nzv_flag$freqRatio), ], 8))

##                    freqRatio percentUnique zeroVar  nzv
## Pool_Area          2039.0000    0.53684724   FALSE TRUE
## Utilities          1023.0000    0.14641288   FALSE TRUE
## Low_Qual_Fin_SF    1010.5000    1.31771596   FALSE TRUE
## Three_season_porch  673.6667    1.12249878   FALSE TRUE
## Pool_QC             509.7500    0.24402147   FALSE TRUE
## BsmtFin_SF_2        453.2500    9.37042460   FALSE TRUE
## Street              226.6667    0.09760859   FALSE TRUE
## Condition_2         202.6000    0.34163006   FALSE TRUE

# Bar chart of frequency ratios
nzv_top <- nzv_cols %>%
  tibble::rownames_to_column("Feature") %>%
  arrange(desc(freqRatio)) %>%
  head(10)

ggplot(nzv_top, aes(x = reorder(Feature, freqRatio), y = freqRatio,
                     fill = nzv)) +
  geom_col() +
  geom_hline(yintercept = 19, linetype = "dashed", color = "red") +
  coord_flip() +
  scale_fill_manual(values = c("FALSE"="#6eb5ff","TRUE"="#ff7b7b")) +
  labs(title = "3.4 Near-Zero Variance — Top 10 Features",
       x = "", y = "Frequency Ratio", fill = "NZV flag") +
  theme_minimal()

Interpretation: Features with a frequency ratio above the red dashed line (19) and very few unique values are flagged as near-zero variance and should be dropped before modelling.

3.5 Numeric Feature Engineering

3.5.1 Skewness Correction

num_vars <- c("Gr_Liv_Area", "Total_Bsmt_SF", "Lot_Frontage",
              "Mas_Vnr_Area", "Garage_Cars")

# Compute skewness before
skew_before <- sapply(ames_train[num_vars], function(x)
  round(e1071::skewness(x, na.rm=TRUE), 3))
cat("Skewness BEFORE:\n"); print(skew_before)

## Skewness BEFORE:

##   Gr_Liv_Area Total_Bsmt_SF  Lot_Frontage  Mas_Vnr_Area   Garage_Cars 
##         1.375         1.411        -0.092         2.625        -0.218

# Apply log1p to skewed features (skew > 0.5)
skew_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(all_of(num_vars[skew_before > 0.5]), offset = 1)

skew_baked <- bake(prep(skew_recipe, ames_train), new_data = ames_train)
skew_after <- sapply(skew_baked[num_vars], function(x)
  round(e1071::skewness(x, na.rm=TRUE), 3))
cat("\nSkewness AFTER log1p:\n"); print(skew_after)

## 
## Skewness AFTER log1p:

##   Gr_Liv_Area Total_Bsmt_SF  Lot_Frontage  Mas_Vnr_Area   Garage_Cars 
##         0.051        -4.933        -0.092         0.528        -0.218

# Side-by-side histogram for Gr_Liv_Area
par(mfrow = c(1, 2))
hist(ames_train$Gr_Liv_Area, main="Gr_Liv_Area (before)",
     col="#ff7b7b", border="white", xlab="")
hist(log1p(ames_train$Gr_Liv_Area), main="log1p(Gr_Liv_Area) (after)",
     col="#7fffd4", border="white", xlab="")

par(mfrow = c(1,1))

3.5.2 Standardization

z_score <- function(x) (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE) min_max <- function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))

raw <- ames_train$Gr_Liv_Area z_sc <- z_score(raw) mm_sc <- min_max(raw)

cat(“Raw - mean:”, round(mean(raw),1), ” sd:“, round(sd(raw),1),”“) cat(”Z-score - mean:“, round(mean(z_sc),4),” sd:“, round(sd(z_sc),4),”“) cat(”Min-Max - range: [“, round(min(mm_sc),3),”,“, round(max(mm_sc),3),”]“)

par(mfrow = c(1, 3)) hist(raw, main=“Original Gr_Liv_Area”, col=“#6eb5ff”, border=“white”) hist(z_sc, main=“Z-score Standardized”, col=“#e8c97a”, border=“white”) hist(mm_sc, main=“Min-Max Scaled 0 to 1”, col=“#7fffd4”, border=“white”) par(mfrow = c(1, 1)) # Z-score standardization z_score <- function(x) (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)

Min-max scaling

min_max <- function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))

raw <- ames_train$Gr_Liv_Area z_sc <- z_score(raw) mm_sc <- min_max(raw)

cat(“Raw — mean:”, round(mean(raw),1), ” sd:“, round(sd(raw),1),”“) cat(”Z-score — mean:“, round(mean(z_sc),4),” sd:“, round(sd(z_sc),4),”“) cat(”Min-Max — range: [“, round(min(mm_sc),3),”,“, round(max(mm_sc),3),”]“)

par(mfrow = c(1, 3)) hist(raw, main=“Original Gr_Liv_Area”, col=“#6eb5ff”, border=“white”) hist(z_sc, main=“Z-score Standardized”, col=“#e8c97a”, border=“white”) hist(mm_sc, main=“Min-Max Scaled [0,1]”, col=“#7fffd4”, border=“white”) par(mfrow = c(1, 1)) `

Key takeaway: Standardization changes the scale of features, not their shape. Z-score is preferred for most ML models. Tree-based models (Random Forest, XGBoost) don’t need standardization at all.

install.packages(“prodlim”, repos = “https://cran.rstudio.com/”)

Generated with R Markdown. Reproduces HOML sections 3.1 to 3.5

HOML Chapter 3: Feature & Target Engineering (§3.1–3.5)

Egshiglen 113034072

2026-04-20