This document reproduces sections 3.1 to 3.5 of Hands-On Machine Learning with R. Book link: https://bradleyboehmke.github.io/HOML/engineering.html
install.packages(c("AmesHousing","rsample","recipes","dplyr","ggplot2",
"visdat","naniar","caret","e1071","tidyr","prodlim"),
repos = "https://cran.rstudio.com/", quiet = TRUE)
## package 'AmesHousing' successfully unpacked and MD5 sums checked
## package 'rsample' successfully unpacked and MD5 sums checked
## package 'recipes' successfully unpacked and MD5 sums checked
## package 'dplyr' successfully unpacked and MD5 sums checked
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'visdat' successfully unpacked and MD5 sums checked
## package 'naniar' successfully unpacked and MD5 sums checked
## package 'caret' successfully unpacked and MD5 sums checked
## package 'e1071' successfully unpacked and MD5 sums checked
## package 'tidyr' successfully unpacked and MD5 sums checked
## package 'prodlim' successfully unpacked and MD5 sums checked
library(AmesHousing)
library(rsample)
library(recipes)
library(dplyr)
library(ggplot2)
library(visdat)
library(naniar)
library(caret)
library(e1071)
ames <- make_ames()
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
cat("Training set:", nrow(ames_train), "rows x", ncol(ames_train), "columns\n")
## Training set: 2049 rows x 81 columns
cat("Test set: ", nrow(ames_test), "rows x", ncol(ames_test), "columns\n")
## Test set: 881 rows x 81 columns
head(ames_train[, 1:6])
## # A tibble: 6 × 6
## MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley
## <fct> <fct> <dbl> <int> <fct> <fct>
## 1 Two_Story_PUD_1946_and_Newer Residential_M… 21 1680 Pave No_A…
## 2 Two_Story_PUD_1946_and_Newer Residential_M… 21 1680 Pave No_A…
## 3 One_Story_PUD_1946_and_Newer Residential_L… 24 2280 Pave No_A…
## 4 One_Story_PUD_1946_and_Newer Residential_L… 50 7175 Pave No_A…
## 5 One_Story_1945_and_Older Residential_H… 70 9800 Pave No_A…
## 6 Duplex_All_Styles_and_Ages Residential_M… 68 8930 Pave No_A…
library(MASS)
cat("Skewness (original):", round(skewness(ames_train$Sale_Price), 3), "\n")
## Skewness (original): 1.671
log_price <- log(ames_train$Sale_Price)
cat("Skewness (log): ", round(skewness(log_price), 3), "\n")
## Skewness (log): -0.096
bc <- boxcox(Sale_Price ~ 1, data = ames_train, lambda = seq(-2, 2, 0.1))
lam_opt <- bc$x[which.max(bc$y)]
cat("Box-Cox optimal lambda:", round(lam_opt, 4), "\n")
## Box-Cox optimal lambda: 0.0606
par(mfrow = c(1, 3), bg = "white")
hist(ames_train$Sale_Price,
main = "Original Sale_Price", col = "#6eb5ff", border = "white", xlab = "")
hist(log_price,
main = "log(Sale_Price)", col = "#7fffd4", border = "white", xlab = "")
hist((ames_train$Sale_Price^lam_opt - 1) / lam_opt,
main = paste0("Box-Cox lambda=", round(lam_opt, 2)),
col = "#e8c97a", border = "white", xlab = "")
par(mfrow = c(1, 1))
vis_miss(ames_train, warn_large_data = FALSE)
miss_summary <- data.frame(
Variable = names(ames_train),
Pct_Missing = sapply(ames_train, function(x) mean(is.na(x)) * 100)
)
miss_summary <- miss_summary[miss_summary$Pct_Missing > 0, ]
miss_summary <- miss_summary[order(-miss_summary$Pct_Missing), ]
print(miss_summary)
## [1] Variable Pct_Missing
## <0 rows> (or 0-length row.names)
impute_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_median(Lot_Frontage) %>%
step_impute_knn(all_numeric_predictors(), neighbors = 5) %>%
step_impute_mode(all_nominal_predictors())
imputed <- prep(impute_recipe, training = ames_train)
ames_imputed <- bake(imputed, new_data = ames_train)
cat("Missing values BEFORE:", sum(is.na(ames_train)), "\n")
## Missing values BEFORE: 0
cat("Missing values AFTER: ", sum(is.na(ames_imputed)), "\n")
## Missing values AFTER: 0
nzv_cols <- nearZeroVar(ames_train, saveMetrics = TRUE)
nzv_flag <- nzv_cols[nzv_cols$nzv == TRUE, ]
cat("Near-zero variance features:", nrow(nzv_flag), "\n")
## Near-zero variance features: 21
print(head(nzv_flag[order(-nzv_flag$freqRatio), ], 8))
## freqRatio percentUnique zeroVar nzv
## Pool_Area 2039.0000 0.53684724 FALSE TRUE
## Utilities 1023.0000 0.14641288 FALSE TRUE
## Low_Qual_Fin_SF 1010.5000 1.31771596 FALSE TRUE
## Three_season_porch 673.6667 1.12249878 FALSE TRUE
## Pool_QC 509.7500 0.24402147 FALSE TRUE
## BsmtFin_SF_2 453.2500 9.37042460 FALSE TRUE
## Street 226.6667 0.09760859 FALSE TRUE
## Condition_2 202.6000 0.34163006 FALSE TRUE
nzv_top <- data.frame(
Feature = rownames(nzv_cols),
FreqRatio = nzv_cols$freqRatio,
NZV = nzv_cols$nzv
)
nzv_top <- head(nzv_top[order(-nzv_top$FreqRatio), ], 10)
ggplot(nzv_top, aes(x = reorder(Feature, FreqRatio), y = FreqRatio, fill = NZV)) +
geom_col() +
geom_hline(yintercept = 19, linetype = "dashed", color = "red") +
coord_flip() +
scale_fill_manual(values = c("FALSE" = "#6eb5ff", "TRUE" = "#ff7b7b")) +
labs(title = "3.4 Near-Zero Variance - Top 10 Features",
x = "", y = "Frequency Ratio", fill = "NZV Flag") +
theme_minimal()
num_vars <- c("Gr_Liv_Area", "Total_Bsmt_SF", "Lot_Frontage",
"Mas_Vnr_Area", "Garage_Cars")
skew_before <- sapply(ames_train[num_vars],
function(x) round(skewness(x, na.rm = TRUE), 3))
cat("Skewness BEFORE:\n")
## Skewness BEFORE:
print(skew_before)
## Gr_Liv_Area Total_Bsmt_SF Lot_Frontage Mas_Vnr_Area Garage_Cars
## 1.375 1.411 -0.092 2.625 -0.218
skew_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_log(Gr_Liv_Area, Total_Bsmt_SF, Mas_Vnr_Area, offset = 1)
skew_baked <- bake(prep(skew_recipe, ames_train), new_data = ames_train)
skew_after <- sapply(skew_baked[num_vars],
function(x) round(skewness(x, na.rm = TRUE), 3))
cat("\nSkewness AFTER log1p:\n")
##
## Skewness AFTER log1p:
print(skew_after)
## Gr_Liv_Area Total_Bsmt_SF Lot_Frontage Mas_Vnr_Area Garage_Cars
## 0.051 -4.933 -0.092 0.528 -0.218
par(mfrow = c(1, 2))
hist(ames_train$Gr_Liv_Area,
main = "Gr_Liv_Area (before)", col = "#ff7b7b", border = "white", xlab = "")
hist(log1p(ames_train$Gr_Liv_Area),
main = "log1p(Gr_Liv_Area) (after)", col = "#7fffd4", border = "white", xlab = "")
par(mfrow = c(1, 1))
z_score <- function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
min_max <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
raw <- ames_train$Gr_Liv_Area
z_sc <- z_score(raw)
mm_sc <- min_max(raw)
cat("Raw - mean:", round(mean(raw), 1), " sd:", round(sd(raw), 1), "\n")
## Raw - mean: 1504.4 sd: 518.4
cat("Z-score - mean:", round(mean(z_sc), 4), " sd:", round(sd(z_sc), 4), "\n")
## Z-score - mean: 0 sd: 1
cat("Min-Max - min:", round(min(mm_sc), 3), " max:", round(max(mm_sc), 3), "\n")
## Min-Max - min: 0 max: 1
par(mfrow = c(1, 3))
hist(raw, main = "Original Gr_Liv_Area", col = "#6eb5ff", border = "white", xlab = "")
hist(z_sc, main = "Z-score Standardized", col = "#e8c97a", border = "white", xlab = "")
hist(mm_sc, main = "Min-Max Scaled 0 to 1", col = "#7fffd4", border = "white", xlab = "")
par(mfrow = c(1, 1))
Generated with R Markdown. Reproduces HOML sections 3.1 to 3.5