# Packages this script depends on.
pkgs <- c(
  "dplyr", "ggplot2", "visdat", "caret",
  "recipes", "AmesHousing", "reshape2", "forecast"
)

# Install only what is missing, so re-running the script does not
# re-download every package each time.
missing_pkgs <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# One library() call per line: a trailing `#` comment runs to the end of the
# line, so calls sharing a line with an earlier comment are never executed.
library(dplyr)       # for data manipulation
library(ggplot2)     # for awesome graphics
library(visdat)      # for additional visualizations
library(caret)       # for various ML tasks
library(recipes)     # for feature engineering tasks
library(AmesHousing) # for Ames housing data
library(reshape2)    # for melting data
library(forecast)    # for Box-Cox transformations
# Raw data for missingness examples
ames_raw <- AmesHousing::ames_raw

# Processed training data
ames_train <- AmesHousing::make_ames()
# Log-transform the response directly and peek at the first few values.
transformed_response <- log(ames_train$Sale_Price)
head(transformed_response)
# Recipe blueprint: model Sale_Price on all other columns and log-transform
# the outcome. Nothing is estimated until the recipe is prepped.
ames_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_log(all_outcomes())

# Print the recipe.
ames_recipe
# log() is undefined for negative input, while log1p(x) computes log(1 + x)
# and therefore accepts values greater than -1.
log(-0.5)   # NaN
log1p(-0.5) # -0.6931472
# Example lambda from book
lambda <- -0.03616899

# Forward Box-Cox transform of the value 10.
y <- forecast::BoxCox(10, lambda)

# Invert a Box-Cox transformation.
#
# x:      numeric vector of Box-Cox-transformed values.
# lambda: scalar Box-Cox parameter used for the forward transform.
#
# Returns exp(x) when lambda is exactly 0, otherwise
# (lambda * x + 1)^(1 / lambda).
inv_box_cox <- function(x, lambda) {
  # lambda == 0 is an exact sentinel selecting the log-transform branch,
  # so exact comparison is intentional here.
  if (lambda == 0) {
    exp(x)
  } else {
    (lambda * x + 1)^(1 / lambda)
  }
}

# Round trip: expected to recover the original value (10).
inv_box_cox(y, lambda)
# Distribution of the response on its original scale.
ggplot(ames_train, aes(x = Sale_Price)) +
  geom_histogram(bins = 30) +
  ggtitle("Original Sale_Price Distribution") +
  theme_bw()

# Distribution of the response after a log transformation.
ggplot(ames_train, aes(x = log(Sale_Price))) +
  geom_histogram(bins = 30) +
  ggtitle("Log-Transformed Sale_Price Distribution") +
  theme_bw()
# Total number of missing cells in the raw data.
ames_raw %>%
  is.na() %>%
  sum()
# Heat map of missingness: melt the logical is.na() matrix into long form
# (Var1 = row index, Var2 = column name, value = missing?) and draw one
# raster cell per data cell.
ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  coord_flip() +
  scale_y_continuous(NULL, expand = c(0, 0)) +
  scale_fill_grey(name = "", labels = c("Present", "Missing")) +
  xlab("Observation") +
  theme(axis.text.y = element_text(size = 4))
# Rows with a missing garage type, alongside the other garage columns.
# The raw column names contain spaces, so they must be quoted with backticks.
ames_raw %>%
  filter(is.na(`Garage Type`)) %>%
  select(`Garage Type`, `Garage Cars`, `Garage Area`)
# Missingness plot from visdat, with clustering enabled.
ames_raw %>%
  vis_miss(cluster = TRUE)
# NOTE: recipes renamed the imputation steps from step_*impute() to
# step_impute_*(); the old names no longer work in current releases.

# Median imputation of a single numeric predictor.
ames_recipe %>%
  step_impute_median(Gr_Liv_Area)

# K-nearest-neighbour imputation of all predictors.
ames_recipe %>%
  step_impute_knn(all_predictors(), neighbors = 6)

# Bagged-tree imputation of all predictors.
ames_recipe %>%
  step_impute_bag(all_predictors())
# Estimate the KNN-imputation recipe on the training data.
# step_impute_knn() is the current name of the former step_knnimpute().
prep_knn <- prep(
  ames_recipe %>% step_impute_knn(all_predictors(), neighbors = 6),
  training = ames_train
)

# Apply the prepped recipe to the training data.
baked_data <- bake(prep_knn, new_data = ames_train)

# Count the missing values remaining after imputation.
sum(is.na(baked_data))
# Per-column variance diagnostics from caret, keeping only the columns
# flagged as near-zero variance. The column names are moved out of the
# row names first.
caret::nearZeroVar(ames_train, saveMetrics = TRUE) %>%
  tibble::rownames_to_column() %>%
  filter(nzv)
# Add the zero-variance filter, then the near-zero-variance filter,
# both over all predictors.
ames_recipe %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors())
# Fresh recipe applying a Yeo-Johnson transformation.
# The all_numeric() selector has no role filter here.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_YeoJohnson(all_numeric())
# Standardize numeric columns: center, then scale, excluding the outcome.
ames_recipe %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())
# Summary of two predictors on their original scales.
summary(ames_train[, c("Gr_Liv_Area", "Year_Built")])
# Estimate the centering and scaling statistics on the training data.
prep_std <- prep(
  ames_recipe %>%
    step_center(all_numeric(), -all_outcomes()) %>%
    step_scale(all_numeric(), -all_outcomes()),
  training = ames_train
)

# Apply the standardization to the training data.
baked_std <- bake(prep_std, new_data = ames_train)

# Summary of the same two predictors after centering and scaling.
summary(baked_std[, c("Gr_Liv_Area", "Year_Built")])