Feature Engineering with the Ames Housing Data

library(AmesHousing)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

library(recipes)

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(visdat)

# Build the Ames housing data frame that the rest of the document works on.
ames <- AmesHousing::make_ames()

library(AmesHousing)
library(caret)

## Loading required package: lattice

# Rebuild the Ames data so this chunk can be run on its own.
ames <- make_ames()

# Fix the RNG seed: createDataPartition() samples rows at random, so without
# a seed the training set (and every result derived from it) changes on each
# run.
set.seed(123)

# Row indices for roughly 80% of the data, sampled on Sale_Price.
split <- createDataPartition(ames$Sale_Price, p = 0.8, list = FALSE)

# Training set: the sampled rows, all columns.
ames_train <- ames[split, ]

# Log-transform the response and inspect its distribution.
transformed_response <- log(ames_train$Sale_Price)

summary(transformed_response)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.456  11.769  11.983  12.021  12.271  13.521

# Recipe blueprint: Sale_Price is the outcome, every other column of the
# training data is a predictor, and a log step is applied to the outcome.
ames_recipe <- step_log(
  recipe(Sale_Price ~ ., data = ames_train),
  all_outcomes()
)

# Print the blueprint.
ames_recipe

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

# log() of a negative number is undefined: R returns NaN with a warning.
log(-0.5)

## Warning in log(-0.5): NaNs produced

## [1] NaN

# log1p(x) computes log(1 + x), so it is still defined for -0.5.
log1p(-0.5)

## [1] -0.6931472

# Box-Cox transform the value 10 with lambda = 0.5; the result is stored in y.
y <- forecast::BoxCox(10, lambda = 0.5)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## [1] 4.324555
## attr(,"lambda")
## [1] 0.5

#' Invert a Box-Cox transformation
#'
#' Maps values from the Box-Cox scale back to the original scale: `exp(x)`
#' when `lambda` is 0, otherwise `(lambda * x + 1)^(1 / lambda)`.
#'
#' @param x Numeric vector of Box-Cox-transformed values. The computation is
#'   plain vector arithmetic, so attributes attached to `x` carry over to the
#'   result.
#' @param lambda A single, non-missing number: the transformation parameter
#'   that was used for the forward transform.
#' @return A numeric vector the same length as `x`, on the original scale.
inv_box_cox <- function(x, lambda) {
  # The branch below needs a scalar TRUE/FALSE condition; reject NA,
  # non-numeric, or vector `lambda` up front with a clear message.
  if (!is.numeric(lambda) || length(lambda) != 1L || is.na(lambda)) {
    stop("`lambda` must be a single non-missing number.", call. = FALSE)
  }

  if (lambda == 0) {
    # lambda = 0 is the log transform, whose inverse is exp().
    exp(x)
  } else {
    (lambda * x + 1)^(1 / lambda)
  }
}

# Round-trip check: inverting y with the same lambda should give back 10.
inv_box_cox(y, 0.5)

## [1] 10
## attr(,"lambda")
## [1] 0.5

# The ames_raw data set shipped with the AmesHousing package.
ames_raw <- AmesHousing::ames_raw

# Total number of missing cells across the whole data frame.
sum(is.na(ames_raw))

## [1] 13997

# Heat map of missingness: one tile per cell of ames_raw, shaded by whether
# the value is missing. After melting the logical matrix, Var1 is the row
# (observation) and Var2 is the column (variable).
#
# Axis titles follow the aesthetics, not the flipped display, so x (Var2)
# must be titled "Variable" and y (Var1) "Observation".
ames_raw %>%
  is.na() %>%
  reshape2::melt() %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_raster() +
  coord_flip() +
  scale_fill_grey(name = "",
                  labels = c("Present", "Missing")) +
  labs(x = "Variable", y = "Observation") +
  theme(axis.text.y = element_text(size = 4))

# Missingness overview of the same data via visdat, with clustering enabled.
vis_miss(ames_raw, cluster = TRUE)

# Blueprint plus median imputation of Gr_Liv_Area (printed, not stored).
step_impute_median(ames_recipe, Gr_Liv_Area)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

## • Median imputation for: Gr_Liv_Area

# Blueprint plus mode imputation of every nominal column (printed, not
# stored).
step_impute_mode(ames_recipe, all_nominal())

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

## • Mode imputation for: all_nominal()

# Blueprint plus k-nearest-neighbour imputation of all predictors, using
# 6 neighbours (printed, not stored).
step_impute_knn(ames_recipe, all_predictors(), neighbors = 6)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

## • K-nearest neighbor imputation for: all_predictors()

# Blueprint plus bagged-tree imputation of all predictors (printed, not
# stored).
step_impute_bag(ames_recipe, all_predictors())

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

## • Bagged tree imputation for: all_predictors()

# Near-zero-variance diagnostics for every training column, with the column
# names moved out of the row names, keeping only the rows flagged `nzv`.
dplyr::filter(
  tibble::rownames_to_column(
    caret::nearZeroVar(ames_train, saveMetrics = TRUE)
  ),
  nzv
)

##               rowname  freqRatio percentUnique zeroVar  nzv
## 1              Street  292.25000    0.08525149   FALSE TRUE
## 2               Alley   22.27551    0.12787724   FALSE TRUE
## 3        Land_Contour   23.23077    0.17050298   FALSE TRUE
## 4           Utilities 1171.50000    0.12787724   FALSE TRUE
## 5          Land_Slope   21.00000    0.12787724   FALSE TRUE
## 6         Condition_2  211.09091    0.34100597   FALSE TRUE
## 7           Roof_Matl  128.38889    0.34100597   FALSE TRUE
## 8           Bsmt_Cond   20.52941    0.25575448   FALSE TRUE
## 9      BsmtFin_Type_2   24.64198    0.29838022   FALSE TRUE
## 10            Heating   92.24000    0.25575448   FALSE TRUE
## 11    Low_Qual_Fin_SF  578.25000    1.23614663   FALSE TRUE
## 12      Kitchen_AbvGr   21.95098    0.17050298   FALSE TRUE
## 13         Functional   39.00000    0.34100597   FALSE TRUE
## 14      Open_Porch_SF   24.95122    9.71867008   FALSE TRUE
## 15     Enclosed_Porch  109.66667    6.86274510   FALSE TRUE
## 16 Three_season_porch  771.33333    1.19352089   FALSE TRUE
## 17       Screen_Porch  238.00000    4.34782609   FALSE TRUE
## 18          Pool_Area 2335.00000    0.51150895   FALSE TRUE
## 19            Pool_QC  778.33333    0.21312873   FALSE TRUE
## 20       Misc_Feature   30.58108    0.21312873   FALSE TRUE
## 21           Misc_Val  161.85714    1.40664962   FALSE TRUE

# Blueprint plus a zero-variance filter followed by a near-zero-variance
# filter, both over all predictors (printed, not stored).
step_nzv(
  step_zv(ames_recipe, all_predictors()),
  all_predictors()
)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

## • Zero variance filter on: all_predictors()

## • Sparse, unbalanced variable filter on: all_predictors()

# Fresh recipe (no log step on the outcome) that applies a Yeo-Johnson
# transformation to every numeric predictor.
step_YeoJohnson(
  recipe(Sale_Price ~ ., data = ames_train),
  all_numeric_predictors()
)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Yeo-Johnson transformation on: all_numeric_predictors()

# Blueprint plus standardisation: center, then scale, every numeric column
# except the outcome (printed, not stored).
step_scale(
  step_center(ames_recipe, all_numeric(), -all_outcomes()),
  all_numeric(), -all_outcomes()
)

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 80

##

## ── Operations

## • Log transformation on: all_outcomes()

## • Centering for: all_numeric() -all_outcomes()

## • Scaling for: all_numeric() -all_outcomes()

Feature Engineering with the Ames Housing Data

Faiz Haikal_114035108

2026-03-08