library(dplyr)    
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) 
library(visdat)  
library(caret)    
## Loading required package: lattice
library(recipes)  
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(AmesHousing)
library(rsample)
## Warning: package 'rsample' was built under R version 4.4.3
## 
## Attaching package: 'rsample'
## The following object is masked from 'package:caret':
## 
##     calibration
# Rebuild the Chapter 2 training data: load the Ames housing data and take a
# 70% split stratified on Sale_Price. The seed is set first so the random
# split is reproducible.
set.seed(123)
ames <- AmesHousing::make_ames()
split <- rsample::initial_split(
  ames,
  prop = 0.7,
  strata = "Sale_Price"
)
ames_train <- rsample::training(split)

# Blueprint: Sale_Price modelled on all remaining columns of ames_train,
# with a log transformation applied to the outcome.
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_log(all_outcomes())

# Show the recipe summary (inputs by role and queued operations).
ames_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
# 3.3.2.1 Imputation with an estimated statistic: fill missing Gr_Liv_Area
# values with the median. The recipe is not stored, only printed.
recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_median(Gr_Liv_Area)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Median imputation for: Gr_Liv_Area
# 3.3.2.2 K-nearest-neighbor imputation: apply the KNN imputation step to
# every predictor, using 6 neighbors. The recipe is not stored, only printed.
recipe(Sale_Price ~ ., data = ames_train) |>
  step_impute_knn(
    all_predictors(),
    neighbors = 6
  )
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • K-nearest neighbor imputation for: all_predictors()
# Near-zero variance screening with caret: compute the per-column metrics,
# move the column names out of the row names into a "rowname" column, keep
# only the columns flagged as nzv, and show the first five.
caret::nearZeroVar(ames_train, saveMetrics = TRUE) |>
  tibble::rownames_to_column(var = "rowname") |>
  dplyr::filter(nzv) |>
  head(n = 5)
##        rowname  freqRatio percentUnique zeroVar  nzv
## 1       Street  226.66667    0.09760859   FALSE TRUE
## 2        Alley   24.25316    0.14641288   FALSE TRUE
## 3 Land_Contour   19.50000    0.19521718   FALSE TRUE
## 4    Utilities 1023.00000    0.14641288   FALSE TRUE
## 5   Land_Slope   22.15909    0.14641288   FALSE TRUE
# The same near-zero variance filter expressed as a recipe step applied to
# all predictors. The recipe is not stored, only printed.
recipe(Sale_Price ~ ., data = ames_train) |>
  step_nzv(all_predictors())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Sparse, unbalanced variable filter on: all_predictors()
# Numeric predictor transformations, in order: Yeo-Johnson transformation,
# then centering, then scaling. The recipe is not stored, only printed.
recipe(Sale_Price ~ ., data = ames_train) |>
  step_YeoJohnson(all_numeric_predictors()) |>
  step_center(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Yeo-Johnson transformation on: all_numeric_predictors()
## • Centering for: all_numeric_predictors()
## • Scaling for: all_numeric_predictors()