3.1 Prerequisites

library(dplyr)
library(ggplot2)
library(visdat)
library(caret)
library(recipes)
library(AmesHousing)
library(reshape2)

Load the Ames Housing dataset and split it into training (70%) and test (30%) sets.

# Build the processed Ames data and hold out a test set.
ames <- AmesHousing::make_ames()

# Fix the RNG seed so the partition is reproducible.
set.seed(123)
train_index <- caret::createDataPartition(
  y = ames[["Sale_Price"]],
  p = 0.7,
  list = FALSE
)

# Rows picked by train_index form the training set; the rest are the test set.
ames_train <- ames[train_index, ]
ames_test <- ames[-train_index, ]

3.2 Target Engineering

Check skewness of Sale Price.

# Histogram of the raw response in the training data (30 bins).
ggplot(data = ames_train, mapping = aes(x = Sale_Price)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Distribution of Sale Price")

Log transform the response.

# Natural log of the training-set sale prices.
transformed_response <- log(ames_train[["Sale_Price"]])

Create recipe with log transformation.

# Start a recipe that models Sale_Price on every other column, then append a
# log transformation of the outcome.
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train)
ames_recipe <- step_log(ames_recipe, all_outcomes())
ames_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()

Example: undo a log transformation by exponentiating.

# exp() is the inverse of the natural log, so this recovers the original 10.
y <- log(x = 10)
exp(x = y)
## [1] 10

3.3 Dealing with Missingness

Check missing values in raw Ames data.

# Total number of missing cells across the whole raw data set.
AmesHousing::ames_raw %>%
  is.na() %>%
  sum()
## [1] 13997

3.3.1 Visualizing Missing Values

Heatmap of missing values.

# Heatmap of missingness in the raw data. is.na() yields a logical matrix and
# melt() reshapes it to long form: Var1 is the observation (row) index, Var2
# the variable (column) name, and value the missing flag.
AmesHousing::ames_raw %>%
  is.na() %>%
  melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  # coord_flip() only swaps where the axes are drawn; labels still follow the
  # aesthetics, so x must be titled for the variables and y for the
  # observations.
  coord_flip() +
  # FALSE (present) and TRUE (missing) are relabelled; the default legend
  # title "value" adds nothing, so it is dropped.
  scale_fill_grey(name = NULL, labels = c("Present", "Missing")) +
  labs(x = "Variable", y = "Observation")

Visualize the missing values with visdat.

# Missingness overview of the raw data with clustering switched on.
AmesHousing::ames_raw %>%
  vis_miss(cluster = TRUE)

Example: inspect the garage variables for rows where the garage type is missing.

# Narrow to the three garage columns first, then keep only the rows whose
# garage type is missing.
AmesHousing::ames_raw %>%
  select(`Garage Type`, `Garage Cars`, `Garage Area`) %>%
  filter(is.na(`Garage Type`))
## # A tibble: 157 × 3
##    `Garage Type` `Garage Cars` `Garage Area`
##    <chr>                 <int>         <int>
##  1 <NA>                      0             0
##  2 <NA>                      0             0
##  3 <NA>                      0             0
##  4 <NA>                      0             0
##  5 <NA>                      0             0
##  6 <NA>                      0             0
##  7 <NA>                      0             0
##  8 <NA>                      0             0
##  9 <NA>                      0             0
## 10 <NA>                      0             0
## # ℹ 147 more rows

3.3.2 Imputation

Median Imputation

# Append a median-imputation step for Gr_Liv_Area. The result is only printed;
# ames_recipe itself is not reassigned.
step_impute_median(ames_recipe, Gr_Liv_Area)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Median imputation for: Gr_Liv_Area

KNN Imputation

# Append a K-nearest-neighbor imputation step (6 neighbors) over all
# predictors. The result is only printed; ames_recipe is not reassigned.
step_impute_knn(ames_recipe, all_predictors(), neighbors = 6)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • K-nearest neighbor imputation for: all_predictors()

Tree-Based Imputation

# Append a bagged-tree imputation step over all predictors. The result is only
# printed; ames_recipe is not reassigned.
step_impute_bag(ames_recipe, all_predictors())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Bagged tree imputation for: all_predictors()

3.4 Feature Filtering

Detect near-zero variance predictors.

# Compute near-zero variance metrics for every column, move the column names
# out of the row names into a regular column, and keep only the flagged rows.
ames_train %>%
  caret::nearZeroVar(saveMetrics = TRUE) %>%
  tibble::rownames_to_column(var = "rowname") %>%
  dplyr::filter(nzv)
##               rowname  freqRatio percentUnique zeroVar  nzv
## 1              Street  170.08333    0.09741841   FALSE TRUE
## 2               Alley   23.03614    0.14612762   FALSE TRUE
## 3        Land_Contour   20.69663    0.19483682   FALSE TRUE
## 4           Utilities 1025.00000    0.14612762   FALSE TRUE
## 5          Land_Slope   21.67778    0.14612762   FALSE TRUE
## 6         Condition_2  254.37500    0.29225524   FALSE TRUE
## 7           Roof_Matl  184.54545    0.38967365   FALSE TRUE
## 8           Bsmt_Cond   20.72727    0.29225524   FALSE TRUE
## 9      BsmtFin_Type_2   23.75676    0.34096444   FALSE TRUE
## 10       BsmtFin_SF_2  453.25000    9.59571359   FALSE TRUE
## 11            Heating  100.85000    0.29225524   FALSE TRUE
## 12    Low_Qual_Fin_SF  505.50000    1.36385777   FALSE TRUE
## 13      Kitchen_AbvGr   21.27174    0.19483682   FALSE TRUE
## 14         Functional   36.65385    0.38967365   FALSE TRUE
## 15     Enclosed_Porch  124.35714    7.54992694   FALSE TRUE
## 16 Three_season_porch  675.33333    1.16902094   FALSE TRUE
## 17       Screen_Porch  208.11111    4.67608378   FALSE TRUE
## 18          Pool_Area 2042.00000    0.58451047   FALSE TRUE
## 19            Pool_QC  510.50000    0.24354603   FALSE TRUE
## 20       Misc_Feature   34.31034    0.24354603   FALSE TRUE
## 21           Misc_Val  221.44444    1.46127618   FALSE TRUE

Add a near-zero variance filtering step to the recipe.

# Rebuild the recipe from scratch: log-transform the outcome, then filter
# sparse, unbalanced predictors.
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train)
ames_recipe <- step_log(ames_recipe, all_outcomes())
ames_recipe <- step_nzv(ames_recipe, all_predictors())
ames_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Sparse, unbalanced variable filter on: all_predictors()

3.5 Numeric Feature Engineering

Example: center and scale the numeric predictors.

# Separate recipe for the numeric example: log-transform the outcome, then
# center and scale every numeric predictor.
ames_recipe_num <- step_normalize(
  step_log(
    recipe(Sale_Price ~ ., data = ames_train),
    all_outcomes()
  ),
  all_numeric_predictors()
)
ames_recipe_num
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Centering and scaling for: all_numeric_predictors()