This document reproduces sections 3.1 to 3.5 of the online book Hands-On Machine Learning with R by Boehmke & Greenwell. Book link
We use the Ames Housing dataset — residential
property sales with 80+ features. We load it via the
AmesHousing package and split into training/test sets using
rsample.
# Install packages if needed (only runs once)
pkgs <- c("AmesHousing", "rsample", "recipes", "dplyr", "ggplot2",
"visdat", "naniar", "caret")
new_pkgs <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if(length(new_pkgs)) install.packages(new_pkgs, repos = "https://cran.rstudio.com/")
# Load libraries
library(AmesHousing)
library(rsample)
library(recipes)
library(dplyr)
library(ggplot2)
library(visdat)
library(naniar)
# Load the Ames Housing dataset
ames <- make_ames()
# Split into 70% training, 30% testing
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# Preview
cat("Training set:", nrow(ames_train), "rows x", ncol(ames_train), "columns\n")
## Training set: 2049 rows x 81 columns
cat("Test set: ", nrow(ames_test), "rows x", ncol(ames_test), "columns\n")
## Test set: 881 rows x 81 columns
head(ames_train[, 1:6])
## # A tibble: 6 × 6
## MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley
## <fct> <fct> <dbl> <int> <fct> <fct>
## 1 Two_Story_PUD_1946_and_Newer Residential_M… 21 1680 Pave No_A…
## 2 Two_Story_PUD_1946_and_Newer Residential_M… 21 1680 Pave No_A…
## 3 One_Story_PUD_1946_and_Newer Residential_L… 24 2280 Pave No_A…
## 4 One_Story_PUD_1946_and_Newer Residential_L… 50 7175 Pave No_A…
## 5 One_Story_1945_and_Older Residential_H… 70 9800 Pave No_A…
## 6 Duplex_All_Styles_and_Ages Residential_M… 68 8930 Pave No_A…
Sale_Price is right-skewed. We compare three options:
raw, log, and Box-Cox.
library(MASS) # for boxcox
# Check skewness of original
cat("Skewness (original):", round(e1071::skewness(ames_train$Sale_Price), 3), "\n")
## Skewness (original): 1.671
# Option 1: Log transform
ames_train$log_price <- log(ames_train$Sale_Price)
cat("Skewness (log): ", round(e1071::skewness(ames_train$log_price), 3), "\n")
## Skewness (log): -0.096
# Option 2: Box-Cox (find optimal lambda)
bc <- boxcox(Sale_Price ~ 1, data = ames_train, lambda = seq(-2, 2, 0.1))
lam_opt <- bc$x[which.max(bc$y)]
cat("Box-Cox optimal lambda:", round(lam_opt, 4), "\n")
## Box-Cox optimal lambda: 0.0606
# Plot all three distributions
par(mfrow = c(1, 3), bg = "white")
hist(ames_train$Sale_Price, main = "Original Sale_Price", col = "#6eb5ff", border = "white", xlab = "")
hist(ames_train$log_price, main = "log(Sale_Price)", col = "#7fffd4", border = "white", xlab = "")
hist((ames_train$Sale_Price^lam_opt - 1)/lam_opt,
main = paste0("Box-Cox (λ=", round(lam_opt,2), ")"),
col = "#e8c97a", border = "white", xlab = "")
par(mfrow = c(1,1))
Interpretation: The log transform reduces skewness dramatically. Box-Cox finds λ ≈ 0, confirming log is the right choice here.
# See which columns have missing data
vis_miss(ames_train, warn_large_data = FALSE) +
labs(title = "3.3.1 Missing Value Map — Ames Training Set") +
theme(axis.text.x = element_text(angle = 90, size = 6))
# Top 10 columns by % missing
miss_summary <- ames_train %>%
summarise(across(everything(), ~mean(is.na(.))*100)) %>%
tidyr::pivot_longer(everything(), names_to="Variable", values_to="Pct_Missing") %>%
filter(Pct_Missing > 0) %>%
arrange(desc(Pct_Missing))
print(miss_summary)
## # A tibble: 0 × 2
## # ℹ 2 variables: Variable <chr>, Pct_Missing <dbl>
# Build a recipe with multiple imputation strategies
impute_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
# Estimated statistic: median for numeric
step_impute_median(Lot_Frontage) %>%
# KNN imputation for other numerics
step_impute_knn(all_numeric_predictors(), neighbors = 5) %>%
# Mode for categoricals
step_impute_mode(all_nominal_predictors())
# Apply the recipe
imputed <- prep(impute_recipe, training = ames_train)
ames_imputed <- bake(imputed, new_data = ames_train)
cat("Missing values BEFORE imputation:", sum(is.na(ames_train)), "\n")
## Missing values BEFORE imputation: 0
cat("Missing values AFTER imputation: ", sum(is.na(ames_imputed)), "\n")
## Missing values AFTER imputation: 0
library(caret)
# Identify near-zero variance features
nzv_cols <- nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_flag <- nzv_cols[nzv_cols$nzv == TRUE, ]
cat("Near-zero variance features detected:", nrow(nzv_flag), "\n")
## Near-zero variance features detected: 21
print(head(nzv_flag[order(-nzv_flag$freqRatio), ], 8))
## freqRatio percentUnique zeroVar nzv
## Pool_Area 2039.0000 0.53684724 FALSE TRUE
## Utilities 1023.0000 0.14641288 FALSE TRUE
## Low_Qual_Fin_SF 1010.5000 1.31771596 FALSE TRUE
## Three_season_porch 673.6667 1.12249878 FALSE TRUE
## Pool_QC 509.7500 0.24402147 FALSE TRUE
## BsmtFin_SF_2 453.2500 9.37042460 FALSE TRUE
## Street 226.6667 0.09760859 FALSE TRUE
## Condition_2 202.6000 0.34163006 FALSE TRUE
# Bar chart of frequency ratios
nzv_top <- nzv_cols %>%
tibble::rownames_to_column("Feature") %>%
arrange(desc(freqRatio)) %>%
head(10)
ggplot(nzv_top, aes(x = reorder(Feature, freqRatio), y = freqRatio,
fill = nzv)) +
geom_col() +
geom_hline(yintercept = 19, linetype = "dashed", color = "red") +
coord_flip() +
scale_fill_manual(values = c("FALSE"="#6eb5ff","TRUE"="#ff7b7b")) +
labs(title = "3.4 Near-Zero Variance — Top 10 Features",
x = "", y = "Frequency Ratio", fill = "NZV flag") +
theme_minimal()
Interpretation: Features with a frequency ratio above the red dashed line (19) and very few unique values are flagged as near-zero variance and should be dropped before modelling.
num_vars <- c("Gr_Liv_Area", "Total_Bsmt_SF", "Lot_Frontage",
"Mas_Vnr_Area", "Garage_Cars")
# Compute skewness before
skew_before <- sapply(ames_train[num_vars], function(x)
round(e1071::skewness(x, na.rm=TRUE), 3))
cat("Skewness BEFORE:\n"); print(skew_before)
## Skewness BEFORE:
## Gr_Liv_Area Total_Bsmt_SF Lot_Frontage Mas_Vnr_Area Garage_Cars
## 1.375 1.411 -0.092 2.625 -0.218
# Apply log1p to skewed features (skew > 0.5)
skew_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_log(all_of(num_vars[skew_before > 0.5]), offset = 1)
skew_baked <- bake(prep(skew_recipe, ames_train), new_data = ames_train)
skew_after <- sapply(skew_baked[num_vars], function(x)
round(e1071::skewness(x, na.rm=TRUE), 3))
cat("\nSkewness AFTER log1p:\n"); print(skew_after)
##
## Skewness AFTER log1p:
## Gr_Liv_Area Total_Bsmt_SF Lot_Frontage Mas_Vnr_Area Garage_Cars
## 0.051 -4.933 -0.092 0.528 -0.218
# Side-by-side histogram for Gr_Liv_Area
par(mfrow = c(1, 2))
hist(ames_train$Gr_Liv_Area, main="Gr_Liv_Area (before)",
col="#ff7b7b", border="white", xlab="")
hist(log1p(ames_train$Gr_Liv_Area), main="log1p(Gr_Liv_Area) (after)",
col="#7fffd4", border="white", xlab="")
par(mfrow = c(1,1))
z_score <- function(x) (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE) min_max <- function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))
raw <- ames_train$Gr_Liv_Area z_sc <- z_score(raw) mm_sc <- min_max(raw)
cat(“Raw - mean:”, round(mean(raw),1), ” sd:“, round(sd(raw),1),”“) cat(”Z-score - mean:“, round(mean(z_sc),4),” sd:“, round(sd(z_sc),4),”“) cat(”Min-Max - range: [“, round(min(mm_sc),3),”,“, round(max(mm_sc),3),”]“)
par(mfrow = c(1, 3)) hist(raw, main=“Original Gr_Liv_Area”, col=“#6eb5ff”, border=“white”) hist(z_sc, main=“Z-score Standardized”, col=“#e8c97a”, border=“white”) hist(mm_sc, main=“Min-Max Scaled 0 to 1”, col=“#7fffd4”, border=“white”) par(mfrow = c(1, 1)) # Z-score standardization z_score <- function(x) (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)
min_max <- function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE))
raw <- ames_train$Gr_Liv_Area z_sc <- z_score(raw) mm_sc <- min_max(raw)
cat(“Raw — mean:”, round(mean(raw),1), ” sd:“, round(sd(raw),1),”“) cat(”Z-score — mean:“, round(mean(z_sc),4),” sd:“, round(sd(z_sc),4),”“) cat(”Min-Max — range: [“, round(min(mm_sc),3),”,“, round(max(mm_sc),3),”]“)
par(mfrow = c(1, 3)) hist(raw, main=“Original Gr_Liv_Area”, col=“#6eb5ff”, border=“white”) hist(z_sc, main=“Z-score Standardized”, col=“#e8c97a”, border=“white”) hist(mm_sc, main=“Min-Max Scaled [0,1]”, col=“#7fffd4”, border=“white”) par(mfrow = c(1, 1)) `
Key takeaway: Standardization changes the scale of features, not their shape. Z-score is preferred for most ML models. Tree-based models (Random Forest, XGBoost) don’t need standardization at all.
install.packages(“prodlim”, repos = “https://cran.rstudio.com/”)
Generated with R Markdown. Reproduces HOML sections 3.1 to 3.5