1 Prerequisites

This chapter leverages the following packages:

# Helper packages
library(dplyr)     # for data manipulation
library(ggplot2)   # for awesome graphics

# Feature engineering packages
library(caret)     # for various ML tasks
library(recipes)   # for feature engineering steps
library(modeldata) # for datasets (attrition, etc.)

This chapter uses the following data sets:

# Ames housing data, as returned by AmesHousing::make_ames()
ames <- AmesHousing::make_ames()

# Job attrition data shipped with the modeldata package
attrition <- modeldata::attrition

# Stratified 70/30 train/test split of the Ames data; the seed is fixed so
# the random partition is reproducible
set.seed(123)
split <- ames %>%
  rsample::initial_split(prop = 0.7, strata = "Sale_Price")
ames_train <- split %>% rsample::training()
ames_test  <- split %>% rsample::testing()

2 Target Engineering

Data preprocessing and engineering techniques generally refer to the addition, deletion, or transformation of data. Although this book focuses primarily on applying ML algorithms, feature engineering can make or break an algorithm’s predictive ability.

One common transformation applies to the response (target) variable. When a numeric response variable is right-skewed, log-transforming the target can lead to more normally distributed errors and often improves model performance.

2.1 Visualising the target distribution

# Histogram of the sale price on its original (dollar) scale
p1 <- ggplot(data = ames_train, mapping = aes(x = Sale_Price)) +
  geom_histogram(colour = "white", fill = "steelblue", bins = 40) +
  scale_x_continuous(labels = scales::dollar) +
  labs(
    title = "Original Sale Price",
    x = "Sale Price",
    y = "Count"
  )

# Same histogram, but with the x axis on a log10 scale
p2 <- ggplot(data = ames_train, mapping = aes(x = Sale_Price)) +
  geom_histogram(colour = "white", fill = "steelblue", bins = 40) +
  scale_x_log10(labels = scales::dollar) +
  labs(
    title = "Log-Transformed Sale Price",
    x = "Sale Price (log scale)",
    y = "Count"
  )

# Draw both panels side by side in a single row
gridExtra::grid.arrange(p1, p2, nrow = 1)
Distribution of Sale_Price before and after log transformation.

Distribution of Sale_Price before and after log transformation.

2.2 Applying a log transformation with recipes

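The recipes package makes it straightforward to apply a log transformation. Note that recipes does not undo the transformation for you, so predictions must be back-transformed manually (e.g., 10^prediction for a base-10 log):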

# Option 1: a recipe that applies a base-10 log directly to the response
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  Sale_Price,
  base = 10
)

# Print the recipe summary
ames_recipe

2.2.1 Using a Box-Cox transformation

When you are unsure whether a log transform is appropriate, a Box-Cox transformation estimates the best power transformation from the data:

# A recipe that applies a Box-Cox transformation to the response
ames_recipe_bc <- step_BoxCox(
  recipe(Sale_Price ~ ., data = ames_train),
  Sale_Price
)

# Print the recipe summary
ames_recipe_bc

Note: When transforming the target variable, remember to back-transform predictions before evaluating on the original scale.


3 Dealing with Missingness

Most real-world data contain missing values. Before imputing, it is important to visualise the missingness structure to understand whether values are missing completely at random (MCAR), missing at random (MAR), or missing not at random (MNAR).

3.1 Visualising missing values

# Use the raw Ames data, which contains missing values
ames_raw <- AmesHousing::ames_raw

# Heatmap of missingness: one raster cell per (observation, variable) pair
ames_raw %>%
  is.na() %>%            # logical matrix, TRUE where a value is missing
  reshape2::melt() %>%   # long format with columns Var1, Var2 and value
  ggplot(aes(Var2, Var1, fill = value)) +
    geom_raster() +
    # scale_fill_grey() defaults to start = 0.2 (dark) and end = 0.8 (light),
    # which would draw present cells (FALSE) dark and missing cells (TRUE)
    # light. Reverse the range so that missing cells are the dark ones.
    scale_fill_grey(name = "Missing",
                    labels = c("Present", "Missing"),
                    start = 0.8, end = 0.2) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Variables", y = "Observations",
         title = "Missing value heatmap – Raw Ames Data")
Heatmap of missing values in the raw Ames data. Each column is a variable; dark cells indicate a missing value.

Heatmap of missing values in the raw Ames data. Each column is a variable; dark cells indicate a missing value.

# Horizontal bar chart of the number of missing values per variable,
# restricted to variables that have at least one missing value
ames_raw %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "variable",
    values_to = "n_missing"
  ) %>%
  filter(n_missing > 0) %>%
  arrange(-n_missing) %>%
  ggplot(mapping = aes(x = reorder(variable, n_missing), y = n_missing)) +
    geom_col(fill = "steelblue") +
    coord_flip() +   # put the variable names on the vertical axis
    labs(
      x = NULL,
      y = "Number of missing values",
      title = "Missing values by variable"
    )
Number of missing values per variable.

Number of missing values per variable.

3.2 Imputation

Rather than removing observations with missing values (which wastes data), we can impute plausible replacement values.

3.2.1 Estimated statistic

The simplest approach replaces missing values with a central tendency statistic (mean, median, or mode).

# Median imputation for a single numeric predictor
ames_recipe_impute <- step_impute_median(
  recipe(Sale_Price ~ ., data = ames_train),
  Gr_Liv_Area
)

# Print the recipe summary
ames_recipe_impute

3.2.2 K-nearest neighbour

KNN imputation predicts each missing value using the k most similar observations. It is often more accurate than a single summary statistic, but it is also more computationally expensive.

# K-nearest-neighbour imputation of all predictors, using 6 neighbours
ames_recipe_knn <- step_impute_knn(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors(),
  neighbors = 6
)

# Print the recipe summary
ames_recipe_knn

Suggested values for neighbors are 5–10 (Kuhn & Johnson 2019).

3.2.3 Tree-based imputation

Bagged decision trees offer a compromise between predictive accuracy and computational cost for imputation.

# Bagged-tree imputation of all predictors
ames_recipe_bag <- step_impute_bag(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors()
)

# Print the recipe summary
ames_recipe_bag

4 Feature Filtering

Features with near-zero or zero variance convey little (if any) information and can harm model performance. The recipes package provides step_nzv() to automatically remove such variables.

# Dimensions of the training data after applying the near-zero-variance
# filter to all predictors. prep() is called with its defaults, which
# estimate the recipe on the data it was created with (ames_train) and
# retain the processed training set for juice().
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors()) %>%
  prep() %>%
  juice() %>%
  dim()
## [1] 2049   60

We can also inspect which variables are flagged as near-zero variance:

# Per-variable near-zero-variance metrics from caret, keeping only the
# flagged variables and listing the largest frequency ratios first
caret::nearZeroVar(ames_train, saveMetrics = TRUE) %>%
  tibble::rownames_to_column(var = "variable") %>%
  filter(nzv) %>%
  arrange(-freqRatio)
##              variable  freqRatio percentUnique zeroVar  nzv
## 1           Pool_Area 2039.00000    0.53684724   FALSE TRUE
## 2           Utilities 1023.00000    0.14641288   FALSE TRUE
## 3     Low_Qual_Fin_SF 1010.50000    1.31771596   FALSE TRUE
## 4  Three_season_porch  673.66667    1.12249878   FALSE TRUE
## 5             Pool_QC  509.75000    0.24402147   FALSE TRUE
## 6        BsmtFin_SF_2  453.25000    9.37042460   FALSE TRUE
## 7              Street  226.66667    0.09760859   FALSE TRUE
## 8         Condition_2  202.60000    0.34163006   FALSE TRUE
## 9            Misc_Val  180.54545    1.56173743   FALSE TRUE
## 10       Screen_Porch  169.90909    4.63640800   FALSE TRUE
## 11          Roof_Matl  144.35714    0.39043436   FALSE TRUE
## 12            Heating  106.00000    0.29282577   FALSE TRUE
## 13     Enclosed_Porch  102.05882    7.41825281   FALSE TRUE
## 14         Functional   38.89796    0.39043436   FALSE TRUE
## 15       Misc_Feature   34.18966    0.24402147   FALSE TRUE
## 16     BsmtFin_Type_2   25.85294    0.34163006   FALSE TRUE
## 17              Alley   24.25316    0.14641288   FALSE TRUE
## 18         Land_Slope   22.15909    0.14641288   FALSE TRUE
## 19      Kitchen_AbvGr   21.23913    0.19521718   FALSE TRUE
## 20          Bsmt_Cond   20.24444    0.29282577   FALSE TRUE
## 21       Land_Contour   19.50000    0.19521718   FALSE TRUE

For highly correlated features, step_corr() can remove predictors above a correlation threshold:

# Dimensions of the training data after the near-zero-variance filter
# followed by a correlation filter (threshold 0.9) on numeric predictors
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors()) %>%
  step_corr(all_numeric_predictors(), threshold = 0.9) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice() %>%
  dim()
## [1] 2049   60

5 Numeric Feature Engineering

5.1 Skewness

Skewed numeric features can negatively affect certain models (especially linear models and neural networks). We can apply a Yeo-Johnson (works with zero and negative values) or Box-Cox transformation to reduce skewness.

# Faceted histograms of three numeric features on their original scale
ames_train %>%
  select(Lot_Area, Gr_Liv_Area, First_Flr_SF) %>%
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "feature",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = value)) +
    geom_histogram(colour = "white", fill = "steelblue", bins = 35) +
    facet_wrap(vars(feature), scales = "free") +
    labs(
      title = "Before transformation",
      x = NULL,
      y = "Count"
    )
Distribution of several right-skewed numeric features.

Distribution of several right-skewed numeric features.

# Blueprint: drop near-zero-variance predictors, then apply a Yeo-Johnson
# transformation to every numeric predictor
ames_recipe_yj <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors()) %>%
  step_YeoJohnson(all_numeric_predictors())

# Estimate the recipe on the training data, retaining the processed data
prep_yj <- ames_recipe_yj %>%
  prep(training = ames_train, retain = TRUE)

# Preview the transformation on Lot_Area, Gr_Liv_Area and First_Flr_SF
prep_yj %>%
  juice() %>%
  select(Lot_Area, Gr_Liv_Area, First_Flr_SF) %>%
  tidyr::pivot_longer(
    cols      = everything(),
    names_to  = "feature",
    values_to = "value"
  ) %>%
  ggplot(mapping = aes(x = value)) +
    geom_histogram(colour = "white", fill = "darkorange", bins = 35) +
    facet_wrap(vars(feature), scales = "free") +
    labs(
      title = "After Yeo-Johnson transformation",
      x = NULL,
      y = "Count"
    )

5.2 Standardization

Many algorithms (KNN, SVMs, regularised regression, neural networks) are sensitive to the scale of numeric features. Standardisation transforms features to have mean 0 and standard deviation 1.

# Blueprint: near-zero-variance filter, Yeo-Johnson transformation, then
# centering and scaling of every numeric predictor
ames_recipe_std <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_nzv(all_predictors()) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  # step_normalize() both centers and scales
  step_normalize(all_numeric_predictors())

# Print the recipe summary
ames_recipe_std

We can verify the effect:

# Estimate the standardisation recipe on the training data
prep_std <- ames_recipe_std %>%
  prep(training = ames_train, retain = TRUE)

# Mean and standard deviation of three processed features
prep_std %>%
  juice() %>%
  summarise(
    across(
      c(Gr_Liv_Area, Lot_Area, First_Flr_SF),
      list(mean = mean, sd = sd)
    )
  )
## # A tibble: 1 × 6
##   Gr_Liv_Area_mean Gr_Liv_Area_sd Lot_Area_mean Lot_Area_sd First_Flr_SF_mean
##              <dbl>          <dbl>         <dbl>       <dbl>             <dbl>
## 1         9.66e-16              1      5.37e-16           1          4.60e-16
## # ℹ 1 more variable: First_Flr_SF_sd <dbl>