library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(visdat)
library(caret)
## Loading required package: lattice
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(AmesHousing)
library(rsample)
## Warning: package 'rsample' was built under R version 4.4.3
##
## Attaching package: 'rsample'
## The following object is masked from 'package:caret':
##
## calibration
# Rebuild the Chapter 2 training set: a 70% sample of the Ames housing
# data, stratified on Sale_Price. The seed is set first so the random
# split is reproducible.
set.seed(123)
ames <- AmesHousing::make_ames()
split <- rsample::initial_split(
  data = ames,
  prop = 0.7,
  strata = "Sale_Price"
)
ames_train <- rsample::training(split)
# Blueprint: model Sale_Price on every other column, with a log
# transformation applied to the outcome.
ames_recipe <-
  recipe(Sale_Price ~ ., data = ames_train) %>%
  recipes::step_log(all_outcomes())

# Auto-print the recipe summary
ames_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
# 3.3.2.1 Estimated statistic: median imputation of Gr_Liv_Area
recipe(Sale_Price ~ ., data = ames_train) %>%
  recipes::step_impute_median(Gr_Liv_Area)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Median imputation for: Gr_Liv_Area
# 3.3.2.2 K-nearest neighbor imputation of all predictors, using 6 neighbors
recipe(Sale_Price ~ ., data = ames_train) %>%
  recipes::step_impute_knn(
    all_predictors(),
    neighbors = 6
  )
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • K-nearest neighbor imputation for: all_predictors()
# Use caret to compute near-zero variance metrics for every column of
# ames_train, keep only the columns flagged as nzv, and show the first five.
caret::nearZeroVar(ames_train, saveMetrics = TRUE) %>%
  tibble::rownames_to_column() %>%  # move column names out of the row names
  dplyr::filter(nzv) %>%
  utils::head(n = 5)
## rowname freqRatio percentUnique zeroVar nzv
## 1 Street 226.66667 0.09760859 FALSE TRUE
## 2 Alley 24.25316 0.14641288 FALSE TRUE
## 3 Land_Contour 19.50000 0.19521718 FALSE TRUE
## 4 Utilities 1023.00000 0.14641288 FALSE TRUE
## 5 Land_Slope 22.15909 0.14641288 FALSE TRUE
# Add a near-zero variance filter over all predictors to the recipe
recipe(Sale_Price ~ ., data = ames_train) %>%
  recipes::step_nzv(all_predictors())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Sparse, unbalanced variable filter on: all_predictors()
# Yeo-Johnson transform the numeric predictors, then center and scale them
recipe(Sale_Price ~ ., data = ames_train) %>%
  recipes::step_YeoJohnson(all_numeric_predictors()) %>%
  recipes::step_center(all_numeric_predictors()) %>%
  recipes::step_scale(all_numeric_predictors())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Yeo-Johnson transformation on: all_numeric_predictors()
## • Centering for: all_numeric_predictors()
## • Scaling for: all_numeric_predictors()