“Live with your data before you plunge into modeling.”
— Leo Breiman
This report reproduces the outputs from Sections 3.1 –
3.5 of
Hands-On
Machine Learning with R.
library(ggplot2)
library(dplyr)
library(recipes)
library(rsample)
library(modeldata)
library(e1071)
library(tidyr)
library(gridExtra)
library(scales)
select <- dplyr::select
# Load Ames Housing dataset
data(ames)
cat("Dataset dimensions:", nrow(ames), "rows x", ncol(ames), "columns\n")## Dataset dimensions: 2930 rows x 74 columns
# 70/30 stratified train/test split
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
cat("Training set:", nrow(ames_train), "rows\n")## Training set: 2049 rows
## Test set: 881 rows
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180923 213500 755000
Skewed target variables can hurt model performance. We compare three
approaches: original, log(), and log10() via
recipes.
skew_orig <- round(skewness(ames_train$Sale_Price), 3)
skew_log <- round(skewness(log(ames_train$Sale_Price)), 3)
rec_log <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_log(Sale_Price, base = 10)
ames_log <- bake(prep(rec_log, ames_train), new_data = ames_train)
skew_log10 <- round(skewness(ames_log$Sale_Price), 3)
cat("Skewness — Original :", skew_orig, "\n")## Skewness — Original : 1.671
## Skewness — log() : -0.096
## Skewness — log10() : -0.096
p1 <- ggplot(ames_train, aes(x = Sale_Price)) +
geom_histogram(bins = 50, fill = "steelblue", color = "white") +
scale_x_continuous(labels = dollar_format()) +
labs(title = paste0("(a) Original\nskewness = ", skew_orig),
x = "Sale Price", y = "Count") +
theme_bw()
p2 <- ggplot(ames_train, aes(x = log(Sale_Price))) +
geom_histogram(bins = 50, fill = "salmon", color = "white") +
labs(title = paste0("(b) log() Transformed\nskewness = ", skew_log),
x = "log(Sale Price)", y = "Count") +
theme_bw()
p3 <- ggplot(ames_log, aes(x = Sale_Price)) +
geom_histogram(bins = 50, fill = "mediumseagreen", color = "white") +
labs(title = paste0("(c) log10 via recipes\nskewness = ", skew_log10),
x = "log10(Sale Price)", y = "Count") +
theme_bw()
grid.arrange(p1, p2, p3, ncol = 3)Result: Log transformation reduces skewness from 1.671 to -0.096, making the target distribution much more symmetric.
# modeldata::ames has no NAs — introducing artificial missingness for demonstration
set.seed(42)
ames_miss <- ames_train
ames_miss$Lot_Frontage[sample(nrow(ames_miss), 330)] <- NA
ames_miss$Garage_Cars[sample(nrow(ames_miss), 150)] <- NA
ames_miss$Mas_Vnr_Area[sample(nrow(ames_miss), 80)] <- NAmissing_count <- colSums(is.na(ames_miss))
missing_df <- data.frame(
feature = names(missing_count),
n_missing = as.integer(missing_count),
pct_missing = round(missing_count / nrow(ames_miss) * 100, 1)
) %>%
filter(n_missing > 0) %>%
arrange(desc(pct_missing))
missing_df## feature n_missing pct_missing
## Lot_Frontage Lot_Frontage 330 16.1
## Garage_Cars Garage_Cars 150 7.3
## Mas_Vnr_Area Mas_Vnr_Area 80 3.9
ggplot(missing_df, aes(x = reorder(feature, pct_missing), y = pct_missing)) +
geom_col(fill = "tomato") +
coord_flip() +
labs(title = "Missing Values per Feature",
x = "Feature", y = "% Missing") +
theme_bw()samp <- ames_miss[sample(nrow(ames_miss), 300), missing_df$feature]
miss_mat <- as.data.frame(is.na(samp)) %>%
mutate(row_id = row_number()) %>%
pivot_longer(-row_id, names_to = "feature", values_to = "missing")
ggplot(miss_mat, aes(x = row_id, y = feature, fill = missing)) +
geom_tile() +
scale_fill_manual(values = c("FALSE" = "#d4e6f1", "TRUE" = "#e74c3c"),
labels = c("Observed", "Missing")) +
labs(title = "Missingness Pattern (300 row sample)",
x = "Row (sample)", y = "Feature", fill = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())We compare three imputation strategies on
Lot_Frontage:
# Mean imputation
rec_mean <- recipe(Sale_Price ~ ., data = ames_miss) %>%
step_impute_mean(Lot_Frontage, Garage_Cars, Mas_Vnr_Area)
ames_mean_imp <- bake(prep(rec_mean, ames_miss), new_data = ames_miss)
# KNN imputation
rec_knn <- recipe(Sale_Price ~ ., data = ames_miss) %>%
step_impute_knn(Lot_Frontage, Garage_Cars, Mas_Vnr_Area, neighbors = 6)
ames_knn_imp <- bake(prep(rec_knn, ames_miss), new_data = ames_miss)
# Bagged tree imputation
rec_bag <- recipe(Sale_Price ~ ., data = ames_miss) %>%
step_impute_bag(Lot_Frontage, Garage_Cars, Mas_Vnr_Area)
ames_bag_imp <- bake(prep(rec_bag, ames_miss), new_data = ames_miss)
cat("NAs remaining after mean imputation: ", sum(is.na(ames_mean_imp$Lot_Frontage)), "\n")## NAs remaining after mean imputation: 0
## NAs remaining after KNN imputation: 0
## NAs remaining after bagged imputation: 0
imp_df <- bind_rows(
data.frame(value = ames_miss$Lot_Frontage[!is.na(ames_miss$Lot_Frontage)],
method = "Original (no NA)"),
data.frame(value = ames_mean_imp$Lot_Frontage, method = "Mean Imputed"),
data.frame(value = ames_knn_imp$Lot_Frontage, method = "KNN Imputed"),
data.frame(value = ames_bag_imp$Lot_Frontage, method = "Bagged Tree Imputed")
) %>%
mutate(method = factor(method,
levels = c("Original (no NA)", "Mean Imputed", "KNN Imputed", "Bagged Tree Imputed")))
ggplot(imp_df, aes(x = value, fill = method)) +
geom_histogram(bins = 40, color = "white", alpha = 0.85) +
facet_wrap(~method, ncol = 2) +
labs(title = "Imputation Methods Comparison: Lot_Frontage",
x = "Lot Frontage", y = "Count") +
theme_bw() +
theme(legend.position = "none")Note: Mean imputation creates a spike at the mean value (visible in the histogram), while KNN and Bagged Tree methods preserve the original distribution shape better.
Near-zero variance (NZV) features add noise and increase computation.
We use step_nzv().
rec_nzv <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_nzv(all_predictors())
ames_nzv <- bake(prep(rec_nzv, ames_train), new_data = ames_train)
removed_nzv <- setdiff(names(ames_train), names(ames_nzv))
cat("Features before NZV filter:", ncol(ames_train) - 1, "\n")## Features before NZV filter: 73
## Features after NZV filter: 53
## Removed features:
## [1] "Street" "Alley" "Land_Contour"
## [4] "Utilities" "Land_Slope" "Condition_2"
## [7] "Roof_Matl" "Bsmt_Cond" "BsmtFin_Type_2"
## [10] "BsmtFin_SF_2" "Heating" "Kitchen_AbvGr"
## [13] "Functional" "Enclosed_Porch" "Three_season_porch"
## [16] "Screen_Porch" "Pool_Area" "Pool_QC"
## [19] "Misc_Feature" "Misc_Val"
factor_cols <- names(ames_train)[sapply(ames_train, is.factor)]
freq_ratios <- sapply(factor_cols, function(col) {
tbl <- sort(table(ames_train[[col]]), decreasing = TRUE)
if (length(tbl) < 2) return(Inf)
as.numeric(tbl[1]) / as.numeric(tbl[2])
})
fr_df <- data.frame(feature = names(freq_ratios), freq_ratio = freq_ratios) %>%
arrange(desc(freq_ratio)) %>%
head(20)
ggplot(fr_df, aes(x = reorder(feature, freq_ratio), y = freq_ratio)) +
geom_col(fill = "steelblue") +
geom_hline(yintercept = 19, color = "red", linetype = "dashed", linewidth = 1) +
coord_flip() +
labs(title = "Near-Zero Variance: Frequency Ratio per Feature\n(Red dashed = default NZV threshold = 19)",
x = "Feature", y = "Frequency Ratio") +
theme_bw()Result: 20 features were removed as near-zero variance predictors, reducing the feature space from 73 to 53.
Many numeric predictors are right-skewed. We apply log and Box-Cox transformations.
num_cols <- names(ames_train)[sapply(ames_train, is.numeric)]
num_cols <- setdiff(num_cols, "Sale_Price")
skew_vals <- sapply(num_cols, function(col) skewness(ames_train[[col]], na.rm = TRUE))
skew_df <- data.frame(feature = num_cols, skewness = round(skew_vals, 3)) %>%
arrange(desc(abs(skewness)))
cat("Top 10 most skewed numeric features:\n")## Top 10 most skewed numeric features:
## feature skewness
## Misc_Val Misc_Val 20.724
## Pool_Area Pool_Area 16.664
## Lot_Area Lot_Area 12.976
## Three_season_porch Three_season_porch 11.268
## Kitchen_AbvGr Kitchen_AbvGr 4.448
## Enclosed_Porch Enclosed_Porch 4.400
## BsmtFin_SF_2 BsmtFin_SF_2 4.157
## Bsmt_Half_Bath Bsmt_Half_Bath 4.002
## Screen_Porch Screen_Porch 3.754
## Open_Porch_SF Open_Porch_SF 2.713
orig_skew <- round(skewness(ames_train$Lot_Area), 2)
log_skew <- round(skewness(log(ames_train$Lot_Area)), 2)
rec_bc <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_median(all_numeric_predictors()) %>%
step_BoxCox(Lot_Area)
ames_bc <- bake(prep(rec_bc, ames_train), new_data = ames_train)
bc_skew <- round(skewness(ames_bc$Lot_Area), 2)
p_s1 <- ggplot(ames_train, aes(x = Lot_Area)) +
geom_histogram(bins = 40, fill = "steelblue", color = "white") +
labs(title = paste0("(a) Original\nskewness = ", orig_skew),
x = "Lot Area", y = "Count") + theme_bw()
p_s2 <- ggplot(ames_train, aes(x = log(Lot_Area))) +
geom_histogram(bins = 40, fill = "salmon", color = "white") +
labs(title = paste0("(b) Log Transformed\nskewness = ", log_skew),
x = "log(Lot Area)", y = "Count") + theme_bw()
p_s3 <- ggplot(ames_bc, aes(x = Lot_Area)) +
geom_histogram(bins = 40, fill = "mediumseagreen", color = "white") +
labs(title = paste0("(c) Box-Cox Transformed\nskewness = ", bc_skew),
x = "Box-Cox(Lot Area)", y = "Count") + theme_bw()
grid.arrange(p_s1, p_s2, p_s3, ncol = 3)Result: Box-Cox reduces
Lot_Areaskewness from 12.98 → 0.09, nearly achieving a normal distribution.
Features on different scales can bias distance-based algorithms. We
use step_center() + step_scale().
## --- Before standardization ---
ames_train %>%
select(all_of(demo_feats)) %>%
summarise(across(everything(),
list(mean = ~round(mean(.x, na.rm=TRUE), 1),
sd = ~round(sd(.x, na.rm=TRUE), 1))))## # A tibble: 1 × 6
## Year_Built_mean Year_Built_sd Gr_Liv_Area_mean Gr_Liv_Area_sd Lot_Area_mean
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1971. 30.2 1504. 518. 10163.
## # ℹ 1 more variable: Lot_Area_sd <dbl>
rec_std <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_median(all_numeric_predictors()) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors())
ames_std <- bake(prep(rec_std, ames_train), new_data = ames_train)
cat("--- After standardization (mean ~ 0, SD ~ 1) ---\n")## --- After standardization (mean ~ 0, SD ~ 1) ---
ames_std %>%
select(all_of(demo_feats)) %>%
summarise(across(everything(),
list(mean = ~round(mean(.x, na.rm=TRUE), 4),
sd = ~round(sd(.x, na.rm=TRUE), 4))))## # A tibble: 1 × 6
## Year_Built_mean Year_Built_sd Gr_Liv_Area_mean Gr_Liv_Area_sd Lot_Area_mean
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 1 0
## # ℹ 1 more variable: Lot_Area_sd <dbl>
before_long <- ames_train %>%
select(all_of(demo_feats)) %>%
pivot_longer(everything(), names_to = "feature", values_to = "value") %>%
mutate(type = "Original")
after_long <- ames_std %>%
select(all_of(demo_feats)) %>%
pivot_longer(everything(), names_to = "feature", values_to = "value") %>%
mutate(type = "Standardized")
both_long <- bind_rows(before_long, after_long) %>%
mutate(type = factor(type, levels = c("Original", "Standardized")))
ggplot(both_long, aes(x = value, fill = type)) +
geom_histogram(bins = 35, color = "white", alpha = 0.85) +
facet_grid(type ~ feature, scales = "free") +
scale_fill_manual(values = c("Original" = "steelblue",
"Standardized" = "darkorange")) +
labs(title = "Standardization: Before vs After",
x = "Value", y = "Count") +
theme_bw() +
theme(legend.position = "none", strip.text = element_text(size = 9))Result: After standardization, all features have mean = 0 and SD = 1, making them comparable regardless of original scale.
| Section | Topic | Key Technique |
|---|---|---|
| 3.1 | Prerequisites | rsample::initial_split() |
| 3.2 | Target Engineering | step_log(), skewness reduction |
| 3.3 | Missingness | step_impute_mean/knn/bag() |
| 3.4 | Feature Filtering | step_nzv() — removed 20 features |
| 3.5 | Numeric Engineering | step_BoxCox(), step_center(),
step_scale() |
Rendered with R 4.4.1 · Source: HOML Chapter 3