library(AmesHousing)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(visdat)
library(AmesHousing)
library(caret)
## Loading required package: lattice

# Build the Ames housing data once; a single make_ames() call is sufficient.
ames <- make_ames()

# createDataPartition() draws a random sample, so fix the RNG seed to make the
# 80% training partition (and every result derived from it) reproducible.
set.seed(123)
split <- createDataPartition(ames$Sale_Price, p = 0.8, list = FALSE)
ames_train <- ames[split, ]

# Inspect the distribution of the log-transformed response on the training set.
transformed_response <- log(ames_train$Sale_Price)
summary(transformed_response)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.456 11.769 11.983 12.021 12.271 13.521
# Base recipe: Sale_Price modelled on every other column of the training data,
# with a log transformation applied to the outcome.
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)
ames_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
# log() is undefined for negative input: the result is NaN and R warns.
log(-0.5)
## Warning in log(-0.5): NaNs produced
## [1] NaN
# log1p(x) computes log(1 + x), so -0.5 is a valid input (log(0.5)).
log1p(-0.5)
## [1] -0.6931472
# Box-Cox transform of the value 10 with lambda = 0.5, i.e.
# (10^0.5 - 1) / 0.5; the printed result carries lambda as an attribute.
y <- forecast::BoxCox(10, lambda = 0.5)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
y
## [1] 4.324555
## attr(,"lambda")
## [1] 0.5
# Invert a Box-Cox transformation.
#
# Arguments:
#   x      Numeric vector of Box-Cox transformed values. Attributes on `x`
#          are carried through to the result.
#   lambda Single, non-missing numeric Box-Cox parameter.
#
# Returns:
#   exp(x) when lambda is 0, otherwise (lambda * x + 1)^(1 / lambda).
#
# Errors if `lambda` is not a single non-missing number.
inv_box_cox <- function(x, lambda) {
  stopifnot(is.numeric(lambda), length(lambda) == 1L, !is.na(lambda))

  if (lambda == 0) {
    return(exp(x))
  }

  if (abs(lambda) < 1e-6) {
    # For a tiny non-zero lambda, lambda * x + 1 rounds to a value next to 1
    # and the huge exponent 1 / lambda magnifies that rounding error.
    # exp(log1p(.) / lambda) is the same quantity computed stably.
    return(exp(log1p(lambda * x) / lambda))
  }

  (lambda * x + 1)^(1 / lambda)
}
# Round trip: inverting the Box-Cox value stored in y with the same lambda
# (0.5) recovers the original input, as the printed result below shows.
inv_box_cox(y, 0.5)
## [1] 10
## attr(,"lambda")
## [1] 0.5
# Keep a local copy of the raw (unprocessed) Ames data and count every
# missing cell across the whole table.
ames_raw <- AmesHousing::ames_raw
ames_raw |>
  is.na() |>
  sum()
## [1] 13997
# Heat map of missingness in the raw data: one tile per (row, column) cell,
# filled by whether that cell is NA.
ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  coord_flip() +
  scale_fill_grey(
    name = "",
    labels = c("Present", "Missing")
  ) +
  # Var2 (x aesthetic) holds the column names and Var1 (y aesthetic) the row
  # index. coord_flip() only changes where each is drawn, not which aesthetic
  # a label belongs to, so title each aesthetic by what it contains.
  labs(x = "Variable", y = "Observation") +
  theme(axis.text.y = element_text(size = 4))

# The same missingness overview via visdat, with row clustering enabled.
vis_miss(ames_raw, cluster = TRUE)

# Extend the base recipe with median imputation for Gr_Liv_Area. The result is
# only printed, not stored.
step_impute_median(ames_recipe, Gr_Liv_Area)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Median imputation for: Gr_Liv_Area
# Extend the base recipe with mode imputation for every nominal variable. The
# result is only printed, not stored.
step_impute_mode(ames_recipe, all_nominal())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Mode imputation for: all_nominal()
# Extend the base recipe with k-nearest-neighbour imputation of all
# predictors, using 6 neighbours. The result is only printed, not stored.
step_impute_knn(ames_recipe, all_predictors(), neighbors = 6)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • K-nearest neighbor imputation for: all_predictors()
# Extend the base recipe with bagged-tree imputation of all predictors. The
# result is only printed, not stored.
step_impute_bag(ames_recipe, all_predictors())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Bagged tree imputation for: all_predictors()
# Per-column near-zero-variance metrics for the training data, with the column
# names moved into a `rowname` column and only the flagged (nzv) rows kept.
caret::nearZeroVar(ames_train, saveMetrics = TRUE) |>
  tibble::rownames_to_column() |>
  dplyr::filter(nzv)
## rowname freqRatio percentUnique zeroVar nzv
## 1 Street 292.25000 0.08525149 FALSE TRUE
## 2 Alley 22.27551 0.12787724 FALSE TRUE
## 3 Land_Contour 23.23077 0.17050298 FALSE TRUE
## 4 Utilities 1171.50000 0.12787724 FALSE TRUE
## 5 Land_Slope 21.00000 0.12787724 FALSE TRUE
## 6 Condition_2 211.09091 0.34100597 FALSE TRUE
## 7 Roof_Matl 128.38889 0.34100597 FALSE TRUE
## 8 Bsmt_Cond 20.52941 0.25575448 FALSE TRUE
## 9 BsmtFin_Type_2 24.64198 0.29838022 FALSE TRUE
## 10 Heating 92.24000 0.25575448 FALSE TRUE
## 11 Low_Qual_Fin_SF 578.25000 1.23614663 FALSE TRUE
## 12 Kitchen_AbvGr 21.95098 0.17050298 FALSE TRUE
## 13 Functional 39.00000 0.34100597 FALSE TRUE
## 14 Open_Porch_SF 24.95122 9.71867008 FALSE TRUE
## 15 Enclosed_Porch 109.66667 6.86274510 FALSE TRUE
## 16 Three_season_porch 771.33333 1.19352089 FALSE TRUE
## 17 Screen_Porch 238.00000 4.34782609 FALSE TRUE
## 18 Pool_Area 2335.00000 0.51150895 FALSE TRUE
## 19 Pool_QC 778.33333 0.21312873 FALSE TRUE
## 20 Misc_Feature 30.58108 0.21312873 FALSE TRUE
## 21 Misc_Val 161.85714 1.40664962 FALSE TRUE
# Extend the base recipe with a zero-variance filter followed by a
# near-zero-variance filter, both over all predictors. Printed, not stored.
ames_recipe |>
  step_zv(all_predictors()) |>
  step_nzv(all_predictors())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Zero variance filter on: all_predictors()
## • Sparse, unbalanced variable filter on: all_predictors()
# A fresh recipe (without the outcome log step) that applies a Yeo-Johnson
# transformation to every numeric predictor. Printed, not stored.
step_YeoJohnson(
  recipe(Sale_Price ~ ., data = ames_train),
  all_numeric_predictors()
)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Yeo-Johnson transformation on: all_numeric_predictors()
# Extend the base recipe by centering and then scaling every numeric column
# except the outcome. Printed, not stored.
ames_recipe |>
  step_center(all_numeric(), -all_outcomes()) |>
  step_scale(all_numeric(), -all_outcomes())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Centering for: all_numeric() -all_outcomes()
## • Scaling for: all_numeric() -all_outcomes()