HW 2 jourast

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) 
library(visdat)

library(caret)

## Loading required package: lattice

library(recipes)

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(AmesHousing)
library(rsample)

## Warning: package 'rsample' was built under R version 4.4.3

## 
## Attaching package: 'rsample'

## The following object is masked from 'package:caret':
## 
##     calibration

# Rebuild the Chapter 2 training set: seed the RNG, then take a 70% split of
# the Ames data stratified on Sale_Price
set.seed(123)
ames <- AmesHousing::make_ames()
split <- rsample::initial_split(
  data = ames,
  prop = 0.7,
  strata = "Sale_Price"
)
ames_train <- rsample::training(split)

# Blueprint: Sale_Price modelled on every other column, with a log
# transformation declared on the outcome
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

ames_recipe

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

# 3.3.2.1 Estimated statistic: declare median imputation for Gr_Liv_Area
step_impute_median(
  recipe(Sale_Price ~ ., data = ames_train),
  Gr_Liv_Area
)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Median imputation for: Gr_Liv_Area

# 3.3.2.2 K-nearest neighbor imputation: declared for every predictor, using
# 6 neighbors
step_impute_knn(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors(),
  neighbors = 6
)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • K-nearest neighbor imputation for: all_predictors()

# Near-zero variance screen with caret: compute the per-column metrics, move
# the row names into a `rowname` column, keep only the rows flagged `nzv`,
# and show the first five
caret::nearZeroVar(x = ames_train, saveMetrics = TRUE) %>%
  tibble::rownames_to_column(var = "rowname") %>%
  dplyr::filter(nzv) %>%
  utils::head(n = 5)

##        rowname  freqRatio percentUnique zeroVar  nzv
## 1       Street  226.66667    0.09760859   FALSE TRUE
## 2        Alley   24.25316    0.14641288   FALSE TRUE
## 3 Land_Contour   19.50000    0.19521718   FALSE TRUE
## 4    Utilities 1023.00000    0.14641288   FALSE TRUE
## 5   Land_Slope   22.15909    0.14641288   FALSE TRUE

# Declare the near-zero variance filter as a recipe step over all predictors
step_nzv(
  recipe(Sale_Price ~ ., data = ames_train),
  all_predictors()
)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Sparse, unbalanced variable filter on: all_predictors()

# Normalize the numeric predictors: Yeo-Johnson transformation first, then
# centering, then scaling
recipes::recipe(Sale_Price ~ ., data = ames_train) %>%
  recipes::step_YeoJohnson(all_numeric_predictors()) %>%
  recipes::step_center(all_numeric_predictors()) %>%
  recipes::step_scale(all_numeric_predictors())

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Yeo-Johnson transformation on: all_numeric_predictors()

## • Centering for: all_numeric_predictors()

## • Scaling for: all_numeric_predictors()

HW 2 jourast

112035125

2026-03-08