Prerequisites

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.0     ✔ recipes      1.0.1
## ✔ dials        1.0.0     ✔ rsample      1.1.0
## ✔ dplyr        1.0.9     ✔ tibble       3.1.8
## ✔ ggplot2      3.3.6     ✔ tidyr        1.2.0
## ✔ infer        1.0.3     ✔ tune         1.0.1
## ✔ modeldata    1.0.1     ✔ workflows    1.1.0
## ✔ parsnip      1.0.2     ✔ workflowsets 1.0.0
## ✔ purrr        0.3.4     ✔ yardstick    1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ✔ stringr 1.4.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
library(here)
## here() starts at C:/Users/danya/OneDrive - University of Cincinnati/R/DataMining

Part 1: Feature Engineering

#1

path <- here('data', 'boston.csv')
boston <- read_csv(path)
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
boston_split <- initial_split(boston, 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)

lml <- linear_reg() %>%
  fit(cmedv ~ ., boston_train)  

lml %>%
  predict(boston_test) %>%  
  bind_cols(select(boston_test, cmedv)) %>% 
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83

#2

boston_recipe <- recipe(cmedv ~ ., boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_YeoJohnson(all_numeric_predictors()) 

bostom_workflow <- workflow() %>%
  add_model(linear_reg()) %>%  
  add_recipe(boston_recipe) 

lml <- bostom_workflow %>%
  fit(boston_train)  

lml %>%
  predict(boston_test) %>%  
  bind_cols(select(boston_test, cmedv)) %>% 
  rmse(truth = cmedv, estimate = .pred) 
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.56

#3

ames <- AmesHousing::make_ames()
set.seed(123) 
split <- initial_split(ames, prop = 0.7, strata = Sale_Price)
ames_train <- training(split)
ames_test <- testing(split)

mlr_recipe <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal_predictors(), threshold = 0.01, other = "other")

mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
  fit(data = ames_train)

mlr_fit %>%
  predict(ames_test) %>%
  bind_cols(ames_test %>% select(Sale_Price)) %>%
  rmse(truth = Sale_Price, estimate = .pred)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      25917.
rm(list = ls())

Part 2: Resampling

#1

path <- here('data', 'Advertising.csv')
adv <- read_csv(path)
## Rows: 200 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): TV, radio, newspaper, sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
split <- initial_split(adv, 0.7, strata = sales)
train <- training(split)
test <- testing(split)

#2

set.seed(123)
kfold <- vfold_cv(train, v = 10, strata = sales)

mlr_recipe <- recipe(sales ~ ., TV + radio + newspaper, data = train)

mlr_wflow <- workflow() %>%
  add_model(linear_reg()) %>%
  add_recipe(mlr_recipe)

mlr_fit <- mlr_wflow %>%
  fit_resamples(kfold)

#3

collect_metrics(mlr_fit)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.65     10  0.154  Preprocessor1_Model1
## 2 rsq     standard   0.913    10  0.0166 Preprocessor1_Model1

#4

collect_metrics(mlr_fit, summarize = FALSE) %>%
  filter(.metric == 'rmse') %>%
  summarise(rng = range(.estimate))
## # A tibble: 2 × 1
##     rng
##   <dbl>
## 1 0.975
## 2 2.59

#5

set.seed(123)
bs_sample <- bootstraps(train, times = 10, strata = sales)

bs_fit <- mlr_wflow %>%
  fit_resamples(bs_sample)  

#6

collect_metrics(bs_fit)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.83     10  0.0973 Preprocessor1_Model1
## 2 rsq     standard   0.885    10  0.0102 Preprocessor1_Model1

#7

collect_metrics(bs_fit, summarize = F) %>%
  filter(.metric == 'rmse') %>%
  summarise(rng = range(.estimate))
## # A tibble: 2 × 1
##     rng
##   <dbl>
## 1  1.42
## 2  2.51