# Install required packages (run once) ----
# Package names must be wrapped in straight double quotes; typographic
# quotes are not valid string delimiters in R.
install.packages(c(
  "dplyr", "ggplot2", "visdat", "caret", "recipes", "AmesHousing",
  "reshape2", "forecast"
))

# Load packages ----
# One call per line: a trailing `#` comment runs to the end of the line, so
# several library() calls on one line would leave all but the first unloaded.
library(dplyr)       # for data manipulation
library(ggplot2)     # for awesome graphics
library(visdat)      # for additional visualizations
library(caret)       # for various ML tasks
library(recipes)     # for feature engineering tasks
library(AmesHousing) # for Ames housing data
library(reshape2)    # for melting data
library(forecast)    # for Box-Cox transformations

# Load data ----
# Raw data, used for the missingness examples.
ames_raw <- AmesHousing::ames_raw
# Processed training data, used for the transformation and recipe examples.
ames_train <- AmesHousing::make_ames()

# Manual log transformation (for illustration) ----
# Take the natural log of the response and preview the first few values.
transformed_response <- log(ames_train$Sale_Price)
head(transformed_response)

# Using recipes to avoid data leakage ----
# Define the log transformation of the outcome as a recipe step instead of
# applying it to the data by hand.
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(all_outcomes())
ames_recipe

# Handling non-positive values ----
# log() is undefined for negative input; log1p(x) computes log(1 + x) and so
# accepts values greater than -1.
log(-0.5)   # NaN (with a warning)
log1p(-0.5) # -0.6931472

# Box-Cox inverse example ----
lambda <- -0.03616899 # example lambda from book
y <- forecast::BoxCox(10, lambda)

# Invert a Box-Cox transformation.
#
# x:      value(s) on the Box-Cox transformed scale.
# lambda: the single lambda used for the forward transformation;
#         lambda == 0 corresponds to the log transformation.
# Returns the value(s) on the original scale.
inv_box_cox <- function(x, lambda) {
  if (lambda == 0) {
    exp(x)
  } else {
    (lambda * x + 1)^(1 / lambda)
  }
}

# Round trip: recovers the original value of 10.
inv_box_cox(y, lambda)

# Histograms of distributions (approximating Figure 3.1/3.2) ----
# Response on its original scale.
ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 30) +
  ggtitle("Original Sale_Price Distribution") +
  theme_bw()

# Response after a natural-log transformation.
ggplot(ames_train, aes(x = log(Sale_Price))) +
  geom_histogram(bins = 30) +
  ggtitle("Log-Transformed Sale_Price Distribution") +
  theme_bw()

# Count total missing values ----
# is.na() yields a logical matrix; summing it counts the TRUE (missing) cells.
sum(is.na(ames_raw))

# Missingness heatmap (approximating Figure 3.3) ----
# Melt the logical missingness matrix to long form (Var1 = row index,
# Var2 = column name, value = is missing) and draw one raster cell per entry.
ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  coord_flip() +
  scale_y_continuous(NULL, expand = c(0, 0)) +
  scale_fill_grey(name = "", labels = c("Present", "Missing")) +
  xlab("Observation") +
  theme(axis.text.y = element_text(size = 4))

# Inspect specific missingness patterns ----
# The raw column names contain spaces, so they must be quoted with backticks.
ames_raw %>%
  filter(is.na(`Garage Type`)) %>%
  select(`Garage Type`, `Garage Cars`, `Garage Area`)

# Clustered missingness visualization (approximating Figure 3.4) ----
vis_miss(ames_raw, cluster = TRUE)

# Imputation examples using recipes ----

# Median imputation
# step_impute_median() is the current name of the deprecated
# step_medianimpute().
ames_recipe %>%
  step_impute_median(Gr_Liv_Area)

# KNN imputation
# step_impute_knn() is the current name of the deprecated step_knnimpute().
ames_recipe %>%
  step_impute_knn(all_predictors(), neighbors = 6)

# Bagged tree imputation
# step_impute_bag() is the current name of the deprecated step_bagimpute().
ames_recipe %>%
  step_impute_bag(all_predictors())

# Example: Apply KNN imputation and check ----
# Estimate the imputation step on the training data, apply it, and count the
# missing values that remain.
prep_knn <- ames_recipe %>%
  step_impute_knn(all_predictors(), neighbors = 6) %>%
  prep(training = ames_train)
baked_data <- bake(prep_knn, new_data = ames_train)
sum(is.na(baked_data))

# Identify near-zero variance features ----
# saveMetrics = TRUE returns one row of metrics per column; move the row
# names (feature names) into a column and keep the rows flagged by `nzv`.
caret::nearZeroVar(ames_train, saveMetrics = TRUE) %>%
  tibble::rownames_to_column() %>%
  filter(nzv)

# Add filtering steps to recipe ----
ames_recipe %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors())

# Yeo-Johnson transformation for skewness ----
# Built on a fresh recipe (without the log step) and applied to every numeric
# column selected by all_numeric().
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric())

# Standardization (center and scale) ----
# Center, then scale, every numeric column except the outcome.
ames_recipe %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

# Example: Before and after standardization ----

# Before
summary(ames_train[, c("Gr_Liv_Area", "Year_Built")])

# After (prep and bake)
# Estimate the centering and scaling steps on the training data, apply them,
# and summarise the same two columns for comparison.
prep_std <- ames_recipe %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  prep(training = ames_train)
baked_std <- bake(prep_std, new_data = ames_train)
summary(baked_std[, c("Gr_Liv_Area", "Year_Built")])