library(AmesHousing)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(visdat)

library(AmesHousing)
library(caret)
## Loading required package: lattice

# Build the processed Ames housing data once, after all packages are attached.
# (The data set was previously constructed twice in a row with identical calls.)
ames <- make_ames()

# createDataPartition() samples rows at random; fix the RNG seed so the
# partition, and every result computed from ames_train, is reproducible.
set.seed(123)

# Row indices of an 80% training sample drawn with respect to Sale_Price.
split <- createDataPartition(ames$Sale_Price, p = 0.8, list = FALSE)

# Training rows only.
ames_train <- ames[split, ]

# Log-transform the response to inspect its distribution on the log scale.
transformed_response <- log(ames_train$Sale_Price)

summary(transformed_response)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.456  11.769  11.983  12.021  12.271  13.521
# Blueprint that models Sale_Price on every other column of the training
# data and log-transforms the outcome. Nothing is estimated at this point;
# the recipe only records the steps.
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

# Show the recipe definition.
ames_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
# log() is undefined for negative inputs: R emits a warning and returns NaN.
log(-0.5)
## Warning in log(-0.5): NaNs produced
## [1] NaN
# log1p(x) computes log(1 + x), so -0.5 maps to log(0.5) and is well defined.
log1p(-0.5)
## [1] -0.6931472
# Box-Cox transform the value 10 with lambda = 0.5. The namespace-qualified
# call uses forecast without attaching it.
y <- forecast::BoxCox(10, lambda = 0.5)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Print the transformed value; the result carries a "lambda" attribute.
y
## [1] 4.324555
## attr(,"lambda")
## [1] 0.5
#' Invert a Box-Cox transformation
#'
#' Maps values from the Box-Cox scale back to the original scale:
#' `exp(x)` when `lambda` is 0, otherwise `(lambda * x + 1)^(1 / lambda)`.
#'
#' @param x Vector of Box-Cox-transformed values. Attributes on `x` (for
#'   example a "lambda" attribute) are carried through by the arithmetic.
#' @param lambda A single, non-missing Box-Cox parameter.
#' @return A vector the same length as `x`, on the original scale.
inv_box_cox <- function(x, lambda) {
  # `if` needs exactly one TRUE/FALSE; fail early with a readable message
  # instead of the generic condition error a vector or NA would trigger.
  if (length(lambda) != 1L || is.na(lambda)) {
    stop("`lambda` must be a single non-missing value", call. = FALSE)
  }

  if (lambda == 0) {
    exp(x)
  } else {
    (lambda * x + 1)^(1 / lambda)
  }
}

# Round trip: inverting the transformed value with the same lambda recovers 10.
inv_box_cox(y, 0.5)
## [1] 10
## attr(,"lambda")
## [1] 0.5
# Unprocessed Ames data, taken directly from the package namespace.
ames_raw <- AmesHousing::ames_raw

# Total number of missing cells across the whole data set.
ames_raw |>
  is.na() |>
  sum()
## [1] 13997
# Heat map of missingness in the raw Ames data: one cell per
# observation/variable pair, shaded by whether the value is missing.
ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  coord_flip() +
  scale_fill_grey(
    name = "",
    labels = c("Present", "Missing")
  ) +
  # Axis titles follow the aesthetics, not the drawn position: x is Var2
  # (the variable names) and y is Var1 (the row index), even after
  # coord_flip(). Label each aesthetic with what it actually holds.
  labs(x = "Variable", y = "Observation") +
  # Shrink the tick labels on the vertical axis so they stay legible.
  theme(axis.text.y = element_text(size = 4))

# Missingness overview of the raw data with the cluster option switched on.
visdat::vis_miss(x = ames_raw, cluster = TRUE)

# Median imputation for Gr_Liv_Area. The extended recipe is printed, not
# assigned, so ames_recipe itself is left unchanged.
step_impute_median(ames_recipe, Gr_Liv_Area)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Median imputation for: Gr_Liv_Area
# Mode imputation for every nominal variable. The extended recipe is
# printed, not assigned, so ames_recipe itself is left unchanged.
step_impute_mode(ames_recipe, all_nominal())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Mode imputation for: all_nominal()
# K-nearest-neighbour imputation across all predictors, using 6 neighbours.
# The extended recipe is printed, not assigned.
step_impute_knn(ames_recipe, all_predictors(), neighbors = 6)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • K-nearest neighbor imputation for: all_predictors()
# Bagged-tree imputation across all predictors. The extended recipe is
# printed, not assigned.
step_impute_bag(ames_recipe, all_predictors())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Bagged tree imputation for: all_predictors()
# Per-column near-zero-variance metrics for the training data. Row names
# (the column names of ames_train) become a regular column, and only the
# rows flagged nzv = TRUE are kept.
ames_train |>
  caret::nearZeroVar(saveMetrics = TRUE) |>
  tibble::rownames_to_column() |>
  dplyr::filter(nzv)
##               rowname  freqRatio percentUnique zeroVar  nzv
## 1              Street  292.25000    0.08525149   FALSE TRUE
## 2               Alley   22.27551    0.12787724   FALSE TRUE
## 3        Land_Contour   23.23077    0.17050298   FALSE TRUE
## 4           Utilities 1171.50000    0.12787724   FALSE TRUE
## 5          Land_Slope   21.00000    0.12787724   FALSE TRUE
## 6         Condition_2  211.09091    0.34100597   FALSE TRUE
## 7           Roof_Matl  128.38889    0.34100597   FALSE TRUE
## 8           Bsmt_Cond   20.52941    0.25575448   FALSE TRUE
## 9      BsmtFin_Type_2   24.64198    0.29838022   FALSE TRUE
## 10            Heating   92.24000    0.25575448   FALSE TRUE
## 11    Low_Qual_Fin_SF  578.25000    1.23614663   FALSE TRUE
## 12      Kitchen_AbvGr   21.95098    0.17050298   FALSE TRUE
## 13         Functional   39.00000    0.34100597   FALSE TRUE
## 14      Open_Porch_SF   24.95122    9.71867008   FALSE TRUE
## 15     Enclosed_Porch  109.66667    6.86274510   FALSE TRUE
## 16 Three_season_porch  771.33333    1.19352089   FALSE TRUE
## 17       Screen_Porch  238.00000    4.34782609   FALSE TRUE
## 18          Pool_Area 2335.00000    0.51150895   FALSE TRUE
## 19            Pool_QC  778.33333    0.21312873   FALSE TRUE
## 20       Misc_Feature   30.58108    0.21312873   FALSE TRUE
## 21           Misc_Val  161.85714    1.40664962   FALSE TRUE
# Add a zero-variance filter and then a near-zero-variance filter over all
# predictors. The extended recipe is printed, not assigned.
ames_recipe |>
  step_zv(all_predictors()) |>
  step_nzv(all_predictors())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Zero variance filter on: all_predictors()
## • Sparse, unbalanced variable filter on: all_predictors()
# Fresh recipe (no outcome log step) that applies a Yeo-Johnson
# transformation to every numeric predictor. Printed, not assigned.
step_YeoJohnson(
  recipe(Sale_Price ~ ., data = ames_train),
  all_numeric_predictors()
)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Yeo-Johnson transformation on: all_numeric_predictors()
# Standardise the numeric columns other than the outcome: centre first,
# then scale. The extended recipe is printed, not assigned.
ames_recipe |>
  step_center(all_numeric(), -all_outcomes()) |>
  step_scale(all_numeric(), -all_outcomes())
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 80
## 
## ── Operations
## • Log transformation on: all_outcomes()
## • Centering for: all_numeric() -all_outcomes()
## • Scaling for: all_numeric() -all_outcomes()