Overview

This document reproduces sections 3.1 to 3.5 of Hands-On Machine Learning with R. Book link: https://bradleyboehmke.github.io/HOML/engineering.html


3.1 Prerequisites

install.packages(c("AmesHousing","rsample","recipes","dplyr","ggplot2",
                   "visdat","naniar","caret","e1071","tidyr","prodlim"),
                 repos = "https://cran.rstudio.com/", quiet = TRUE)
## package 'AmesHousing' successfully unpacked and MD5 sums checked
## package 'rsample' successfully unpacked and MD5 sums checked
## package 'recipes' successfully unpacked and MD5 sums checked
## package 'dplyr' successfully unpacked and MD5 sums checked
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'visdat' successfully unpacked and MD5 sums checked
## package 'naniar' successfully unpacked and MD5 sums checked
## package 'caret' successfully unpacked and MD5 sums checked
## package 'e1071' successfully unpacked and MD5 sums checked
## package 'tidyr' successfully unpacked and MD5 sums checked
## package 'prodlim' successfully unpacked and MD5 sums checked
library(AmesHousing)
library(rsample)
library(recipes)
library(dplyr)
library(ggplot2)
library(visdat)
library(naniar)
library(caret)
library(e1071)

ames <- make_ames()

set.seed(123)
split      <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test  <- testing(split)

cat("Training set:", nrow(ames_train), "rows x", ncol(ames_train), "columns\n")
## Training set: 2049 rows x 81 columns
cat("Test set:    ", nrow(ames_test),  "rows x", ncol(ames_test),  "columns\n")
## Test set:     881 rows x 81 columns
head(ames_train[, 1:6])
## # A tibble: 6 × 6
##   MS_SubClass                  MS_Zoning      Lot_Frontage Lot_Area Street Alley
##   <fct>                        <fct>                 <dbl>    <int> <fct>  <fct>
## 1 Two_Story_PUD_1946_and_Newer Residential_M…           21     1680 Pave   No_A…
## 2 Two_Story_PUD_1946_and_Newer Residential_M…           21     1680 Pave   No_A…
## 3 One_Story_PUD_1946_and_Newer Residential_L…           24     2280 Pave   No_A…
## 4 One_Story_PUD_1946_and_Newer Residential_L…           50     7175 Pave   No_A…
## 5 One_Story_1945_and_Older     Residential_H…           70     9800 Pave   No_A…
## 6 Duplex_All_Styles_and_Ages   Residential_M…           68     8930 Pave   No_A…

3.2 Target Engineering

library(MASS)

cat("Skewness (original):", round(skewness(ames_train$Sale_Price), 3), "\n")
## Skewness (original): 1.671
log_price <- log(ames_train$Sale_Price)
cat("Skewness (log):     ", round(skewness(log_price), 3), "\n")
## Skewness (log):      -0.096
bc      <- boxcox(Sale_Price ~ 1, data = ames_train, lambda = seq(-2, 2, 0.1))

lam_opt <- bc$x[which.max(bc$y)]
cat("Box-Cox optimal lambda:", round(lam_opt, 4), "\n")
## Box-Cox optimal lambda: 0.0606
par(mfrow = c(1, 3), bg = "white")
hist(ames_train$Sale_Price,
     main = "Original Sale_Price", col = "#6eb5ff", border = "white", xlab = "")
hist(log_price,
     main = "log(Sale_Price)", col = "#7fffd4", border = "white", xlab = "")
hist((ames_train$Sale_Price^lam_opt - 1) / lam_opt,
     main = paste0("Box-Cox lambda=", round(lam_opt, 2)),
     col = "#e8c97a", border = "white", xlab = "")

par(mfrow = c(1, 1))

3.3 Dealing with Missingness

3.3.1 Visualising Missing Values

vis_miss(ames_train, warn_large_data = FALSE)

miss_summary <- data.frame(
  Variable    = names(ames_train),
  Pct_Missing = sapply(ames_train, function(x) mean(is.na(x)) * 100)
)
miss_summary <- miss_summary[miss_summary$Pct_Missing > 0, ]
miss_summary <- miss_summary[order(-miss_summary$Pct_Missing), ]
print(miss_summary)
## [1] Variable    Pct_Missing
## <0 rows> (or 0-length row.names)

3.3.2 Imputation

impute_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_impute_median(Lot_Frontage) %>%
  step_impute_knn(all_numeric_predictors(), neighbors = 5) %>%
  step_impute_mode(all_nominal_predictors())

imputed      <- prep(impute_recipe, training = ames_train)
ames_imputed <- bake(imputed, new_data = ames_train)

cat("Missing values BEFORE:", sum(is.na(ames_train)), "\n")
## Missing values BEFORE: 0
cat("Missing values AFTER: ", sum(is.na(ames_imputed)), "\n")
## Missing values AFTER:  0

3.4 Feature Filtering - Near-Zero Variance

nzv_cols <- nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_flag <- nzv_cols[nzv_cols$nzv == TRUE, ]

cat("Near-zero variance features:", nrow(nzv_flag), "\n")
## Near-zero variance features: 21
print(head(nzv_flag[order(-nzv_flag$freqRatio), ], 8))
##                    freqRatio percentUnique zeroVar  nzv
## Pool_Area          2039.0000    0.53684724   FALSE TRUE
## Utilities          1023.0000    0.14641288   FALSE TRUE
## Low_Qual_Fin_SF    1010.5000    1.31771596   FALSE TRUE
## Three_season_porch  673.6667    1.12249878   FALSE TRUE
## Pool_QC             509.7500    0.24402147   FALSE TRUE
## BsmtFin_SF_2        453.2500    9.37042460   FALSE TRUE
## Street              226.6667    0.09760859   FALSE TRUE
## Condition_2         202.6000    0.34163006   FALSE TRUE
nzv_top <- data.frame(
  Feature   = rownames(nzv_cols),
  FreqRatio = nzv_cols$freqRatio,
  NZV       = nzv_cols$nzv
)
nzv_top <- head(nzv_top[order(-nzv_top$FreqRatio), ], 10)

ggplot(nzv_top, aes(x = reorder(Feature, FreqRatio), y = FreqRatio, fill = NZV)) +
  geom_col() +
  geom_hline(yintercept = 19, linetype = "dashed", color = "red") +
  coord_flip() +
  scale_fill_manual(values = c("FALSE" = "#6eb5ff", "TRUE" = "#ff7b7b")) +
  labs(title = "3.4 Near-Zero Variance - Top 10 Features",
       x = "", y = "Frequency Ratio", fill = "NZV Flag") +
  theme_minimal()


3.5 Numeric Feature Engineering

3.5.1 Skewness

num_vars <- c("Gr_Liv_Area", "Total_Bsmt_SF", "Lot_Frontage",
              "Mas_Vnr_Area", "Garage_Cars")

skew_before <- sapply(ames_train[num_vars],
                      function(x) round(skewness(x, na.rm = TRUE), 3))
cat("Skewness BEFORE:\n")
## Skewness BEFORE:
print(skew_before)
##   Gr_Liv_Area Total_Bsmt_SF  Lot_Frontage  Mas_Vnr_Area   Garage_Cars 
##         1.375         1.411        -0.092         2.625        -0.218
skew_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(Gr_Liv_Area, Total_Bsmt_SF, Mas_Vnr_Area, offset = 1)

skew_baked <- bake(prep(skew_recipe, ames_train), new_data = ames_train)

skew_after <- sapply(skew_baked[num_vars],
                     function(x) round(skewness(x, na.rm = TRUE), 3))
cat("\nSkewness AFTER log1p:\n")
## 
## Skewness AFTER log1p:
print(skew_after)
##   Gr_Liv_Area Total_Bsmt_SF  Lot_Frontage  Mas_Vnr_Area   Garage_Cars 
##         0.051        -4.933        -0.092         0.528        -0.218
par(mfrow = c(1, 2))
hist(ames_train$Gr_Liv_Area,
     main = "Gr_Liv_Area (before)", col = "#ff7b7b", border = "white", xlab = "")
hist(log1p(ames_train$Gr_Liv_Area),
     main = "log1p(Gr_Liv_Area) (after)", col = "#7fffd4", border = "white", xlab = "")

par(mfrow = c(1, 1))

3.5.2 Standardization

z_score <- function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
min_max <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))

raw   <- ames_train$Gr_Liv_Area
z_sc  <- z_score(raw)
mm_sc <- min_max(raw)

cat("Raw     - mean:", round(mean(raw), 1), "  sd:", round(sd(raw), 1), "\n")
## Raw     - mean: 1504.4   sd: 518.4
cat("Z-score - mean:", round(mean(z_sc), 4), "  sd:", round(sd(z_sc), 4), "\n")
## Z-score - mean: 0   sd: 1
cat("Min-Max - min:", round(min(mm_sc), 3), " max:", round(max(mm_sc), 3), "\n")
## Min-Max - min: 0  max: 1
par(mfrow = c(1, 3))
hist(raw,   main = "Original Gr_Liv_Area",  col = "#6eb5ff", border = "white", xlab = "")
hist(z_sc,  main = "Z-score Standardized",  col = "#e8c97a", border = "white", xlab = "")
hist(mm_sc, main = "Min-Max Scaled 0 to 1", col = "#7fffd4", border = "white", xlab = "")

par(mfrow = c(1, 1))

Generated with R Markdown. Reproduces HOML sections 3.1 to 3.5