Reproducing outputs from Sections 3.1 to 3.5 of Hands-On Machine Learning with R.
# Helper packages
library(dplyr) # for data manipulation
library(ggplot2) # for awesome graphics
library(visdat) # for additional visualizations
# Feature engineering packages
library(caret) # for various ML tasks
library(recipes) # for feature engineering tasks
# Additional packages
library(purrr)
library(broom)
# Set plotting theme
ggplot2::theme_set(ggplot2::theme_light())
# Load Ames housing data
ames <- AmesHousing::make_ames()
# Stratified 70/30 split on Sale_Price
set.seed(123)
split <- rsample::initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test <- rsample::testing(split)
dim(ames_train)
## [1] 2049 81
dim(ames_test)
## [1] 881 81
head(ames_train$Sale_Price)
## [1] 105500 88000 120000 125000 67500 112000
Figure 3.1 – Comparing residuals from a non-log-transformed vs. a log-transformed model:
models <- c("Non-log transformed model residuals",
"Log transformed model residuals")
list(
m1 = lm(Sale_Price ~ Year_Built, data = ames_train),
m2 = lm(log(Sale_Price) ~ Year_Built, data = ames_train)
) %>%
map2_dfr(models, ~ broom::augment(.x) %>% mutate(model = .y)) %>%
ggplot(aes(.resid)) +
geom_histogram(bins = 75) +
facet_wrap(~ model, scales = "free_x") +
ylab(NULL) +
xlab("Residuals")
Figure 3.1: Residuals – non-log vs. log-transformed Sale_Price
Applying a log transformation to the response via
recipes:
# Direct log transform
transformed_response <- log(ames_train$Sale_Price)
head(transformed_response)
## [1] 11.56647 11.38509 11.69525 11.73607 11.11988 11.62625
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_log(all_outcomes())
ames_recipe
Demonstrating that log() of a negative number produces
NaN:
log(-0.5)
## [1] NaN
Box-Cox transformation via recipes:
ames_recipe_bc <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_BoxCox(all_outcomes())
ames_recipe_bc
ames_raw <- AmesHousing::ames_raw
ames_raw %>%
is.na() %>%
reshape2::melt() %>%
ggplot(aes(Var2, Var1, fill = value)) +
geom_raster() +
coord_flip() +
scale_y_continuous(NULL, expand = c(0, 0)) +
scale_fill_grey(name = "",
labels = c("Present", "Missing")) +
xlab("Observation") +
theme(axis.text.y = element_text(size = 4))
Figure 3.2: Missing value map for raw Ames data
visdat::vis_miss(ames_raw, cluster = TRUE)
Figure 3.3: vis_miss cluster plot
ames_recipe_med <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_median(Gr_Liv_Area)
ames_recipe_med
ames_recipe_knn <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_knn(all_predictors(), neighbors = 6)
ames_recipe_knn
ames_recipe_bag <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_impute_bag(all_predictors())
ames_recipe_bag
# Identify near-zero variance features with caret
nzv_info <- caret::nearZeroVar(ames_train, saveMetrics = TRUE)
cat("Features flagged as near-zero variance:\n")
## Features flagged as near-zero variance:
print(nzv_info[nzv_info$nzv == TRUE, ])
## freqRatio percentUnique zeroVar nzv
## Street 226.66667 0.09760859 FALSE TRUE
## Alley 24.25316 0.14641288 FALSE TRUE
## Land_Contour 19.50000 0.19521718 FALSE TRUE
## Utilities 1023.00000 0.14641288 FALSE TRUE
## Land_Slope 22.15909 0.14641288 FALSE TRUE
## Condition_2 202.60000 0.34163006 FALSE TRUE
## Roof_Matl 144.35714 0.39043436 FALSE TRUE
## Bsmt_Cond 20.24444 0.29282577 FALSE TRUE
## BsmtFin_Type_2 25.85294 0.34163006 FALSE TRUE
## BsmtFin_SF_2 453.25000 9.37042460 FALSE TRUE
## Heating 106.00000 0.29282577 FALSE TRUE
## Low_Qual_Fin_SF 1010.50000 1.31771596 FALSE TRUE
## Kitchen_AbvGr 21.23913 0.19521718 FALSE TRUE
## Functional 38.89796 0.39043436 FALSE TRUE
## Enclosed_Porch 102.05882 7.41825281 FALSE TRUE
## Three_season_porch 673.66667 1.12249878 FALSE TRUE
## Screen_Porch 169.90909 4.63640800 FALSE TRUE
## Pool_Area 2039.00000 0.53684724 FALSE TRUE
## Pool_QC 509.75000 0.24402147 FALSE TRUE
## Misc_Feature 34.18966 0.24402147 FALSE TRUE
## Misc_Val 180.54545 1.56173743 FALSE TRUE
# Remove near-zero variance predictors via recipes
ames_recipe_nzv <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_nzv(all_predictors())
ames_recipe_nzv
# Compute skewness for all numeric features
skewed_features <- ames_train %>%
select(where(is.numeric)) %>%
summarise(across(everything(),
~ e1071::skewness(., na.rm = TRUE))) %>%
tidyr::pivot_longer(everything(),
names_to = "feature",
values_to = "skewness") %>%
arrange(desc(abs(skewness)))
cat("Top 10 most skewed features:\n")
## Top 10 most skewed features:
print(head(skewed_features, 10))
## # A tibble: 10 × 2
## feature skewness
## <chr> <dbl>
## 1 Misc_Val 20.7
## 2 Pool_Area 16.7
## 3 Lot_Area 13.0
## 4 Three_season_porch 11.3
## 5 Low_Qual_Fin_SF 10.6
## 6 Kitchen_AbvGr 4.45
## 7 Enclosed_Porch 4.40
## 8 BsmtFin_SF_2 4.16
## 9 Bsmt_Half_Bath 4.00
## 10 Screen_Porch 3.75
p1 <- ggplot(ames_train, aes(Gr_Liv_Area)) +
geom_histogram(bins = 50, fill = "steelblue", color = "white") +
ggtitle("Original: Gr_Liv_Area") +
xlab("Above grade living area (sq ft)")
p2 <- ggplot(ames_train, aes(log(Gr_Liv_Area))) +
geom_histogram(bins = 50, fill = "darkorange", color = "white") +
ggtitle("Log-transformed: Gr_Liv_Area") +
xlab("log(Gr_Liv_Area)")
gridExtra::grid.arrange(p1, p2, ncol = 2)
Figure 3.4: Gr_Liv_Area before and after log transformation
Apply Yeo-Johnson transformation to all numeric predictors:
ames_recipe_yj <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_YeoJohnson(all_numeric_predictors())
ames_recipe_yj
p3 <- ggplot(ames_train, aes(Year_Built)) +
geom_histogram(bins = 40, fill = "steelblue", color = "white") +
ggtitle("Before Standardization: Year_Built")
# Build, prep, and bake the standardization recipe
ames_recipe_std <- recipe(Sale_Price ~ ., data = ames_train) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors())
ames_std_prep <- prep(ames_recipe_std, training = ames_train)
ames_std_baked <- bake(ames_std_prep, new_data = ames_train)
p4 <- ggplot(ames_std_baked, aes(Year_Built)) +
geom_histogram(bins = 40, fill = "darkorange", color = "white") +
ggtitle("After Standardization: Year_Built")
gridExtra::grid.arrange(p3, p4, ncol = 2)
Figure 3.5: Year_Built before and after standardization
cat("Summary of Year_Built after standardization:\n")
## Summary of Year_Built after standardization:
summary(ames_std_baked$Year_Built)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.29356 -0.61093 0.05145 0.00000 0.97878 1.27685
End of Homework 2 – Sections 3.1 to 3.5 of HOML Chapter 3.