library(dplyr)
library(ggplot2)
library(visdat)
library(caret)
library(recipes)
library(AmesHousing)
library(reshape2)
# Load the Ames Housing dataset.
# Build the Ames data with make_ames(), then split it on the sale price:
# createDataPartition() picks the training rows (p = 0.7) and every row it
# did not pick goes to the test set. The seed makes the split reproducible.
ames <- make_ames()
set.seed(123)
train_index <- createDataPartition(
  y = ames[["Sale_Price"]],
  p = 0.7,
  list = FALSE
)
ames_train <- ames[train_index, ]
ames_test <- ames[-train_index, ]
# Check the skewness of the sale price.
# Histogram of the training-set sale price, used to eyeball its skew.
ggplot(data = ames_train, mapping = aes(x = Sale_Price)) +
  geom_histogram(fill = "steelblue", bins = 30) +
  labs(title = "Distribution of Sale Price")
# Log-transform the response.
# Natural log of the training-set sale price, computed directly on the column.
transformed_response <- log(ames_train[["Sale_Price"]])
# Create a recipe that log-transforms the response.
# Recipe blueprint: model Sale_Price on all other columns of the training
# data and register a log step on the outcome. Nothing is estimated here;
# the recipe only records the step.
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)
ames_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
# Example: undo a log transformation with exp().
# Round trip: exp() inverts the natural log, so exp(log(10)) gives back 10.
y <- log(10)
exp(y)
## [1] 10
# Check for missing values in the raw Ames data.
# Total number of NA cells across the whole raw data set.
AmesHousing::ames_raw %>%
  is.na() %>%
  sum()
## [1] 13997
# Heatmap of missing values.
# Heatmap of missingness in the raw Ames data.
# is.na() turns the data into a logical matrix and melt() reshapes that
# matrix to long form, where Var1 indexes the matrix rows (observations),
# Var2 holds the column (variable) names and value is TRUE for missing cells.
AmesHousing::ames_raw %>%
  is.na() %>%
  melt() %>%
  ggplot(aes(x = Var2, y = Var1, fill = value)) +
  geom_raster() +
  # Flip the display so the variable names run down the vertical axis.
  coord_flip() +
  # Fill breaks are FALSE then TRUE, i.e. present then missing.
  scale_fill_grey(labels = c("Present", "Missing")) +
  # Axis titles follow the aesthetics, not the flipped display: x is the
  # variable (Var2) and y is the observation index (Var1). Titling x as
  # "Observation" would put that title on the axis listing variable names.
  labs(x = "Variable", y = "Observation")
# Visualize missing values with visdat.
# Missingness plot from visdat, with clustering enabled (cluster = TRUE).
visdat::vis_miss(
  x = AmesHousing::ames_raw,
  cluster = TRUE
)
# Example: inspect the rows where the garage type is missing.
# Keep only rows whose garage type is NA and show the garage type, car
# count and area columns for them.
AmesHousing::ames_raw %>%
  filter(is.na(`Garage Type`)) %>%
  select(
    `Garage Type`,
    `Garage Cars`,
    `Garage Area`
  )
## # A tibble: 157 × 3
## `Garage Type` `Garage Cars` `Garage Area`
## <chr> <int> <int>
## 1 <NA> 0 0
## 2 <NA> 0 0
## 3 <NA> 0 0
## 4 <NA> 0 0
## 5 <NA> 0 0
## 6 <NA> 0 0
## 7 <NA> 0 0
## 8 <NA> 0 0
## 9 <NA> 0 0
## 10 <NA> 0 0
## # ℹ 147 more rows
# Preview the recipe with a median-imputation step for Gr_Liv_Area added.
# The result is printed only; ames_recipe itself is not reassigned.
step_impute_median(ames_recipe, Gr_Liv_Area)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Median imputation for: Gr_Liv_Area
# Preview the recipe with K-nearest-neighbor imputation (6 neighbors) on all
# predictors. The result is printed only; ames_recipe is not reassigned.
step_impute_knn(
  ames_recipe,
  all_predictors(),
  neighbors = 6
)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • K-nearest neighbor imputation for: all_predictors()
# Preview the recipe with bagged-tree imputation on all predictors.
# The result is printed only; ames_recipe is not reassigned.
step_impute_bag(ames_recipe, all_predictors())
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Bagged tree imputation for: all_predictors()
# Detect near-zero variance predictors.
# Per-column near-zero-variance metrics for the training data. The column
# names come back as row names, so move them into a regular column and keep
# only the columns flagged nzv = TRUE.
ames_train %>%
  caret::nearZeroVar(saveMetrics = TRUE) %>%
  tibble::rownames_to_column() %>%
  filter(nzv)
## rowname freqRatio percentUnique zeroVar nzv
## 1 Street 170.08333 0.09741841 FALSE TRUE
## 2 Alley 23.03614 0.14612762 FALSE TRUE
## 3 Land_Contour 20.69663 0.19483682 FALSE TRUE
## 4 Utilities 1025.00000 0.14612762 FALSE TRUE
## 5 Land_Slope 21.67778 0.14612762 FALSE TRUE
## 6 Condition_2 254.37500 0.29225524 FALSE TRUE
## 7 Roof_Matl 184.54545 0.38967365 FALSE TRUE
## 8 Bsmt_Cond 20.72727 0.29225524 FALSE TRUE
## 9 BsmtFin_Type_2 23.75676 0.34096444 FALSE TRUE
## 10 BsmtFin_SF_2 453.25000 9.59571359 FALSE TRUE
## 11 Heating 100.85000 0.29225524 FALSE TRUE
## 12 Low_Qual_Fin_SF 505.50000 1.36385777 FALSE TRUE
## 13 Kitchen_AbvGr 21.27174 0.19483682 FALSE TRUE
## 14 Functional 36.65385 0.38967365 FALSE TRUE
## 15 Enclosed_Porch 124.35714 7.54992694 FALSE TRUE
## 16 Three_season_porch 675.33333 1.16902094 FALSE TRUE
## 17 Screen_Porch 208.11111 4.67608378 FALSE TRUE
## 18 Pool_Area 2042.00000 0.58451047 FALSE TRUE
## 19 Pool_QC 510.50000 0.24354603 FALSE TRUE
## 20 Misc_Feature 34.31034 0.24354603 FALSE TRUE
## 21 Misc_Val 221.44444 1.46127618 FALSE TRUE
# Add a near-zero variance filtering step to the recipe.
# Rebuild the recipe from scratch, one step at a time: log the outcome, then
# add a near-zero-variance filter over all predictors.
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train)
ames_recipe <- step_log(ames_recipe, all_outcomes())
ames_recipe <- step_nzv(ames_recipe, all_predictors())
ames_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Sparse, unbalanced variable filter on: all_predictors()
# Example of numeric feature scaling.
# Separate recipe for the scaling example: log the outcome, then center and
# scale every numeric predictor.
ames_recipe_num <- recipe(Sale_Price ~ ., data = ames_train)
ames_recipe_num <- step_log(ames_recipe_num, all_outcomes())
ames_recipe_num <- step_normalize(ames_recipe_num, all_numeric_predictors())
ames_recipe_num
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 80
##
## ── Operations
## • Log transformation on: all_outcomes()
## • Centering and scaling for: all_numeric_predictors()